In [None]:
# Import packages and setup gpu configuration.
# This code block shouldn't need to be adjusted!
import os
import sys
import json
import yaml
import numpy as np
import pandas as pd
import copy
import math
from einops import rearrange
from einops.layers.torch import Rearrange
import time
import random
import h5py
import webdataset as wds
import gc
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
import utils
from models import *
import nibabel as nib
from nilearn import plotting

import schedulefree

# Enable TF32 matmuls (tf32 data type is faster than standard float32)
torch.backends.cuda.matmul.allow_tf32 = True
# if utils.is_interactive():
#     raise ValueError()


def _env_int(name, default):
    """Return environment variable `name` parsed as an int, or `default` if it is unset."""
    raw = os.getenv(name)
    return default if raw is None else int(raw)


### Multi-GPU config ###
device_count = torch.cuda.device_count()
print(f"Number of available CUDA devices: {device_count}")

# Rank of this process on its node
local_rank = _env_int('LOCAL_RANK', 0)
print(f"LOCAL RANK={local_rank}")

# More than one GPU switches the script into distributed mode
num_devices = _env_int('NUM_GPUS', 1)
print(f"NUM GPUS={num_devices}")
distributed = num_devices > 1
if distributed:
    assert device_count == num_devices

node = _env_int('SLURM_NODEID', 0)
print(f"NODE={node}")

global_rank = _env_int('RANK', 0)
print(f"GLOBAL RANK={global_rank}")

world_size = _env_int('WORLD_SIZE', 1)
print(f"WORLD_SIZE={world_size}")

from tqdm import tqdm

# Load parameters from yaml config. The file is opened in a `with` block so the
# handle is closed as soon as parsing finishes.
# NOTE(review): yaml.FullLoader can build more than plain scalars/containers;
# config.yaml is assumed to be a trusted, local file.
with open('config.yaml', 'r') as config_file:
    config = yaml.load(config_file, Loader=yaml.FullLoader)

# Create global variables from the config: every top-level key becomes a
# module-level name that the rest of the script reads directly.
for attribute_name, attribute_value in config.items():
    globals()[attribute_name] = attribute_value

data_type = torch.float32 # change depending on your mixed_precision
# batch_size = global_batch_size // num_devices
# `batch_size` (from the config) is per process; scale by the number of processes.
global_batch_size = batch_size * world_size

# FSDP Setup: in distributed mode join the NCCL process group and bind this
# process to its local GPU; otherwise use the default CUDA device.
if distributed:
    import torch.distributed as dist
    import torch.multiprocessing as mp
    from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
    from torch.distributed.fsdp.api import BackwardPrefetch, CPUOffload, ShardingStrategy
    import functools
    from torch.distributed.fsdp.wrap import size_based_auto_wrap_policy, transformer_auto_wrap_policy
    print("starting init_process_group...")
    dist.init_process_group("nccl", rank=global_rank, world_size=world_size)
    print(f"setting device to cuda:{local_rank}")
    try:
        torch.cuda.set_device(local_rank)
        device = torch.device('cuda',local_rank)
        print(f"\nSuccessfully set cuda:{local_rank} | global_rank{global_rank} | node{node}")
    except Exception as error:
        print(f"\nFAILED TO SET DEVICE cuda:{local_rank} | global_rank{global_rank} | node{node}")
        print("An exception occurred:", error)
        # Re-raise after logging: continuing would leave `device` unbound and the
        # script would fail a few lines later with an unrelated NameError.
        raise

else:
    device = torch.device('cuda')

print("PID of this process =",os.getpid())
print("device =", device, "distributed =",distributed, "num_devices =", num_devices, "local rank =", local_rank, "world size =", world_size, "data_type =", data_type)


print(config)

# seed all random functions
utils.seed_everything(seed)

# Checkpoint directory for this run: ../ckpts/<model_name>
outdir = os.path.abspath(f'../ckpts/{model_name}')
os.makedirs(outdir, exist_ok=True)
print("outdir", outdir)
print("global_batch_size", global_batch_size)
print("use_cls_token", use_cls_token)

# A scalar patch size is shorthand for the same size along all three axes
if type(patch_size) is int:
    patch_size = [patch_size] * 3
patch_depth, patch_height, patch_width = patch_size[0], patch_size[1], patch_size[2]

# Spatial patches per volume multiplied by the number of frames
num_patches = int(
    (img_size[0] / patch_depth)
    * (img_size[1] / patch_height)
    * (img_size[2] / patch_width)
    * num_frames
)
num_patches_per_timepoint = num_patches // frame_patch_size

# Token count that the two keep-ratios below are applied to
_num_tokens = num_patches_per_timepoint * num_frames // frame_patch_size
num_encoder_patches = int(np.floor(_num_tokens * (1 - tube_start_masking_ratio)))
num_decoder_patches = int(np.floor(_num_tokens * (1 - decoder_mask_ratio)))
print("num_patches", num_patches)
print("num_patches_per_timepoint", num_patches_per_timepoint)
print("num_encoder_patches", num_encoder_patches)
print("num_decoder_patches", num_decoder_patches)


# Architecture names for the two halves of the model, as given in the config
vit_size = dict(encoder=encoder_model, decoder=decoder_model)

model = get_vit(
    size=vit_size,
    image_size=img_size,  # depth, height, width
    image_patch_size=(patch_depth, patch_height, patch_width),  # depth, height, width patch size
    frames=num_frames,
    frame_patch_size=frame_patch_size,
    channels=1,
    use_rope_emb=use_rope_emb,
    use_cls_token=use_cls_token,
)
utils.count_params(model)

# function to select random num_frames from sample and obtain brain-positive patches
# (built with twice `num_frames`; the training loop reshapes its output back to
# clips of `num_frames`)
aug_transform = utils.DataPrepper(
    num_frames=num_frames * 2,
    masking_strategy=masking_strategy,
    patch_depth=patch_depth,
    patch_height=patch_height,
    patch_width=patch_width,
    frame_patch_size=frame_patch_size,
)

# Smoke test: push one random batch through the model (and the decoder path when
# enabled) to check that it runs without error.
model = model.to(device)

# Encoder sees the first `num_encoder_patches` positions
encoder_mask = torch.zeros(num_patches_per_timepoint, dtype=torch.bool)
encoder_mask[:num_encoder_patches] = True

# Decoder gets the last `num_decoder_patches` positions, minus any the encoder already has
decoder_mask = torch.zeros(num_patches_per_timepoint, dtype=torch.bool)
decoder_mask[-num_decoder_patches:] = True
decoder_mask[encoder_mask] = False

with torch.no_grad():
    print("\nencoder")
    _dummy_batch = torch.randn(
        batch_size, 1, num_frames, img_size[0], img_size[1], img_size[2]
    ).to(device)
    encoder_out = model(_dummy_batch, encoder_mask=encoder_mask, verbose=True)

    if use_decoder:
        print("\ndecoder")
        decoder_out = model(
            encoder_out,
            encoder_mask=encoder_mask,
            decoder_mask=decoder_mask,
            verbose=True,
        )

    if use_cls_token:
        # Token 0 is treated as the CLS token; the remainder are patch tokens
        enc_cls_token, encoder_patches = encoder_out[:, :1, :], encoder_out[:, 1:, :]
        print("\nenc_cls_token", enc_cls_token.shape)
        print("encoder_patches", encoder_patches.shape)
        if use_decoder:
            dec_cls_token, decoder_patches = decoder_out[:, :1, :], decoder_out[:, 1:, :]
            print("dec_cls_token", dec_cls_token.shape)
            print("decoder_patches", decoder_patches.shape)

class LinearProbe(nn.Module):
    """Classification head applied to encoder features.

    Despite the name this is a small MLP: three (LayerNorm -> GELU -> Linear)
    stages mapping `input_dim` -> `h` -> `h` -> `num_classes`.

    Args:
        input_dim: size of the last dimension of the input features.
        h: hidden width of the two intermediate stages.
        num_classes: number of output logits.
    """

    def __init__(self, input_dim, h=256, num_classes=8):
        super().__init__()
        # self.classifier = nn.Linear(input_dim, num_classes)
        stages = [
            nn.LayerNorm(input_dim), nn.GELU(), nn.Linear(input_dim, h),
            nn.LayerNorm(h), nn.GELU(), nn.Linear(h, h),
            nn.LayerNorm(h), nn.GELU(), nn.Linear(h, num_classes),
        ]
        self.classifier = nn.Sequential(*stages)

    def forward(self, x):
        """Return class logits for `x` (last dimension must be `input_dim`)."""
        return self.classifier(x)

def log_and_continue(exn):
    """Exception handler for webdataset: report `exn` on stdout and return True.

    This function always returns True, which it documents as "ignore and continue".
    """
    message = f'Handling webdataset error ({exn!r}). Ignoring.'
    print(message)
    return True

def filter_corrupted_images(sample):
    """Return True only for samples that contain the required "func.npy" entry."""
    return "func.npy" in sample

### ================      Train Dataset and DataLoader    ====================
from braceexpand import braceexpand
print(train_urls)

# Expand every brace pattern into its individual shard URLs
expanded_urls = [str(url) for pattern in train_urls for url in braceexpand(pattern)]
if is_s3:
    # Read each shard through an `aws s3 cp` pipe
    expanded_urls = [f"pipe:aws s3 cp {url} -" for url in expanded_urls]

train_data = wds.WebDataset(
    expanded_urls,
    resampled=True,
    nodesplitter=wds.split_by_node,
    handler=log_and_continue,
)
train_data = train_data.shuffle(100, initial=100, rng=random.Random(seed))
train_data = train_data.select(filter_corrupted_images)  # drop samples without func.npy
train_data = train_data.decode("torch")

train_dl = torch.utils.data.DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=False,
    drop_last=True,
    pin_memory=True,
)

### ================      Test Dataset and DataLoader    ====================
# Same pipeline as the training loader, but built from `test_urls`.
print(test_urls)
if is_s3:
    expanded_urls = [f"pipe:aws s3 cp {url} -" for pattern in test_urls for url in braceexpand(pattern)]
else:
    # Fixed: this branch previously expanded `train_urls`, so the "test" loader
    # silently served training shards whenever is_s3 was False.
    expanded_urls = [str(url) for pattern in test_urls for url in braceexpand(pattern)]

test_data = (
    wds.WebDataset(expanded_urls, resampled=True, nodesplitter=wds.split_by_node, handler=log_and_continue)
    .shuffle(100, initial=100, rng=random.Random(seed))
    .select(filter_corrupted_images)  # drop samples without func.npy
    .decode("torch")
)
test_dl = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False, drop_last=True, pin_memory=True)


if distributed:
    # my_auto_wrap_policy = functools.partial(
    #     size_based_auto_wrap_policy, min_num_params=200000
    # )
    # Auto-wrap policy keyed on the model's `Attention` layer class
    my_auto_wrap_policy = functools.partial(
        transformer_auto_wrap_policy,
        transformer_layer_cls={Attention},  # <--- Your Transformer layer class
    )
    print(f"\nPrepping FSDP on {global_rank} {node}...\n")
    model = FSDP(
        model.to(device),
        sharding_strategy=ShardingStrategy.HYBRID_SHARD,
        auto_wrap_policy=my_auto_wrap_policy,
        use_orig_params=False,
        cpu_offload=None,  # CPUOffload(offload_params=True)
        sync_module_states=True,
        limit_all_gathers=True,  # See https://github.com/pytorch/pytorch/issues/91165
        device_id=device,
    )
    # The same message is printed before and after the barrier
    _fsdp_ready_msg = f"\nSuccessfully loaded FSDP model to device on global_rank {global_rank}\n"
    print(_fsdp_ready_msg)
    dist.barrier()
    print(_fsdp_ready_msg)


# Parameters whose name contains any of these substrings get no weight decay.
# NOTE(review): the FSDP wrap above uses use_orig_params=False; confirm that
# named_parameters() still exposes names containing "bias"/"LayerNorm" in
# distributed runs, otherwise the no-decay group will be empty.
no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
opt_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 1e-2},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]

if distributed:
    # Scale the learning rate with the global batch size. Keep the base value so
    # the log line shows both numbers (it previously printed the scaled value twice).
    base_max_lr = max_lr
    max_lr = max_lr * global_batch_size
    print(f"multiply lr {base_max_lr} by global batch size: max_lr={max_lr}")

optimizer = torch.optim.AdamW(opt_grouped_parameters, lr=max_lr)
# optimizer = schedulefree.AdamWScheduleFree(opt_grouped_parameters, lr=max_lr)

# Optimizer steps per epoch for the training loop
num_iterations_per_epoch = num_samples_per_epoch // global_batch_size
print("num_iterations_per_epoch", num_iterations_per_epoch)

# Optimizer steps per epoch for fitting the probe
probe_num_iterations_per_epoch = test_num_samples_per_epoch // global_batch_size
print("probe_num_iterations_per_epoch", probe_num_iterations_per_epoch)

# NOTE(review): the training loop steps the scheduler once per iteration per
# process; confirm that multiplying by num_devices here is intended.
total_steps = num_epochs * num_iterations_per_epoch * num_devices
print("total_steps", total_steps)

lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
    optimizer,
    max_lr=max_lr,
    total_steps=total_steps,
    # warm up over the first 10 epochs' worth of iterations
    pct_start=10*num_iterations_per_epoch/total_steps,
    div_factor=25,
    final_div_factor=1000
)

print("\nDone with model preparations!")
num_params = utils.count_params(model)


def save_ckpt(model,tag="last"):
    """Write a checkpoint for `model` to `<outdir>/<tag>.pth`.

    Every process calls `model.state_dict()` (after a barrier in distributed
    mode); only global rank 0 writes the file. The checkpoint holds the
    module-level `epoch`, the model state dict and the module-level
    `optimizer`'s state dict.

    Args:
        model: module whose `state_dict()` is saved.
        tag: file stem of the checkpoint inside `outdir`.
    """
    if distributed:
        dist.barrier()
    model_states = model.state_dict()
    if global_rank != 0:
        return

    os.makedirs(outdir, exist_ok=True)
    ckpt_path = f'{outdir}/{tag}.pth'
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model_states,
        'optimizer_state_dict': optimizer.state_dict(),
    }
    torch.save(checkpoint, ckpt_path)
    print(f"\n---saved {ckpt_path}!---\n")


# Checkpoint saving is switched off when utils.is_interactive() is true
if utils.is_interactive():
    # wandb_log = False
    ckpt_saving = False

if local_rank != 0 or not wandb_log:
    # only use main process for wandb logging
    wandb_log = False
else:
    import wandb
    wandb_project = 'fmri_foundation'
    print(f"wandb {wandb_project} run {model_name}")
    # need to configure wandb beforehand in terminal with "wandb init"!
    # Run hyperparameters recorded with the wandb run (each key is the variable's name)
    wandb_config = dict(
        model_name=model_name,
        global_batch_size=global_batch_size,
        batch_size=batch_size,
        num_epochs=num_epochs,
        num_samples_per_epoch=num_samples_per_epoch,
        test_num_samples_per_epoch=test_num_samples_per_epoch,
        num_iterations_per_epoch=num_iterations_per_epoch,
        encoder_model=encoder_model,
        decoder_model=decoder_model,
        tube_start_masking_ratio=tube_start_masking_ratio,
        tube_end_masking_ratio=tube_end_masking_ratio,
        decoder_mask_ratio=decoder_mask_ratio,
        num_frames=num_frames,
        patch_size=patch_size,
        frame_patch_size=frame_patch_size,
        use_contrastive_loss=use_contrastive_loss,
        use_cls_token=use_cls_token,
        contrastive_loss_weight=contrastive_loss_weight,
        num_params=num_params,
        max_lr=max_lr,
        ckpt_interval=ckpt_interval,
        ckpt_saving=ckpt_saving,
        seed=seed,
        distributed=distributed,
        num_devices=num_devices,
        world_size=world_size,
        train_urls=train_urls,
    )
    print("wandb_config:\n",wandb_config)
    print("wandb_id:",model_name)
    # The model name doubles as run id and display name; resume="allow" is passed
    # so an existing run with that id can be continued
    wandb.init(
        id=model_name,
        project=wandb_project,
        name=model_name,
        config=wandb_config,
        resume="allow",
    )


epoch = 0

# Per-step histories filled in by the training loop
lrs = []
train_losses = []
recon_losses = []
contrastive_losses = []
cos_sim_encoder_output = []
cos_sim_decoder_output = []
cos_sim_encoder_output_patchwise = []

# Per-step histories of the probe (fit on train data, evaluated on test data)
probe_losses = []
probe_accs = []
test_losses = []
test_accs = []


if masking_strategy=="MNI":
    from einops.layers.torch import Rearrange
    # Load the MNI brain template and crop it to an 88 x 104 x 72 volume
    MNI_brain = nib.load("/weka/proj-fmri/paulscotti/fMRI-foundation-model/dataset_creation/afni_conversion/tpl-MNI152NLin2009cAsym_res-02_T1w_brain.nii.gz").get_fdata()
    brain_pos_voxels = MNI_brain[6:94,8:112,10:82]

    # Cut the template into patches (one frame per patch along time)
    _to_patches = Rearrange(
        "b c (f pf) (d pd) (h ph) (w pw) -> b f d h w (pd ph pw pf c)",
        pd=patch_depth,
        ph=patch_height,
        pw=patch_width,
        pf=1,
    )
    _template_volume = torch.Tensor(brain_pos_voxels)[None,None,None]
    brain_pos_pats = _to_patches(_template_volume)

    # Mean template value per patch, flattened to one entry per patch position
    brain_pos_pats_vit = rearrange(brain_pos_pats, "b ... d -> b (...) d").mean(-1)[0]

# Main training loop. Each epoch:
#   1. trains the model on `train_dl` for `num_iterations_per_epoch` steps
#      (optional masked reconstruction loss + optional contrastive loss),
#   2. fits a freshly initialised LinearProbe on frozen encoder CLS features to
#      predict the subject id,
#   3. evaluates that probe on two batches of `test_dl`,
#   4. aggregates metrics, optionally logs reconstruction images and a
#      checkpoint, and synchronises processes.
mse = nn.MSELoss()
l1 = nn.L1Loss()
crossentropy = nn.CrossEntropyLoss()
if use_contrastive_loss:
    # one contrastive temperature per epoch (indexed by `epoch` below)
    contrastive_temps = utils.cosine_anneal(0.004, 0.0075, num_epochs)
progress_bar = tqdm(range(epoch, num_epochs), disable=local_rank!=0, desc="Overall")
for epoch in progress_bar:
    # get the masking ratio for the current epoch
    # NOTE(review): within this loop tube_mask_ratio is only written to the logs;
    # the masks below use the fixed num_encoder_patches computed before the loop.
    tube_mask_ratio = utils.get_masking_ratio(
        current_epoch=epoch,
        total_epochs=num_epochs,
        start_masking_ratio=tube_start_masking_ratio,
        end_masking_ratio=tube_end_masking_ratio
    )
    # NOTE(review): data_type is set to torch.float32 earlier in the file —
    # confirm autocast is meant to run with that dtype.
    with torch.cuda.amp.autocast(dtype=data_type):
        model.train()
        # optimizer.train()
        for train_i, batch in enumerate(train_dl):
            optimizer.zero_grad()

            input_func = batch['func.npy']

            # Subject id is parsed from the last two characters of each subject string.
            # It is repeated twice per sample — presumably because each sample yields
            # two clips after the reshape below; TODO confirm.
            # (Not used by the training step itself, only computed.)
            subject_id = batch['subject_id.txt']
            subject_id = torch.Tensor([int(subject[-2:]) for subject in subject_id]).long()
            subject_id = torch.repeat_interleave(subject_id.long(), 2).to(device)
            # session_id = batch['session_id.txt']
            # session_id = torch.Tensor([int(session[-2:]) for session in session_id]).long().repeat(2).to(device)
            # session_id = torch.repeat_interleave(session_id.long(), 2)

            # NOTE(review): brain_pos_pats_vit is assigned in two of these branches
            # but is not read anywhere else in this loop.
            if masking_strategy=="None":
                func, _ = aug_transform(input_func)
                brain_pos_pats_vit = torch.ones(num_patches_per_timepoint)
            elif masking_strategy=="MNI":
                func, _ = aug_transform(input_func)
            else:
                func, brain_pos_voxels = aug_transform(input_func)
                brain_pos_pats = model.patchify(torch.Tensor(brain_pos_voxels)[None,None,None])
                brain_pos_pats_vit = rearrange(brain_pos_pats, "b ... d -> b (...) d").mean(-1)[0]

            # Regroup into clips of num_frames, add a channel axis, clamp to [0, 1]
            func = func.reshape(-1, num_frames, 
                                func.shape[-3], func.shape[-2], func.shape[-1])
            func = func.unsqueeze(1).float().to(device).clamp(0,1)

            # create encoder and decoder masks
            # One random permutation of patch positions is shared by both masks, so
            # the slices taken from it below do not overlap.
            rand_patches = torch.randperm(num_patches_per_timepoint)

            # encoder_mask1 = torch.zeros(num_patches_per_timepoint).to(torch.bool)
            # encoder_mask1[rand_patches[:num_encoder_patches]] = True
            # encoder_mask1 = encoder_mask1.tile(num_frames//frame_patch_size)

            # encoder_mask2 = torch.zeros(num_patches_per_timepoint).to(torch.bool)
            # encoder_mask2[rand_patches[num_encoder_patches:2*num_encoder_patches]] = True
            # encoder_mask2 = encoder_mask2.tile(num_frames//frame_patch_size)

            # Same spatial positions are kept for every temporal patch (tube masking)
            encoder_mask = torch.zeros(num_patches_per_timepoint).to(torch.bool)
            encoder_mask[rand_patches[:num_encoder_patches]] = True
            encoder_mask = encoder_mask.tile(num_frames//frame_patch_size)

            # NOTE(review): the slice starts at 2*num_encoder_patches, so
            # rand_patches[num_encoder_patches:2*num_encoder_patches] is unused —
            # apparently the range of the commented-out encoder_mask2.
            decoder_mask = torch.zeros(num_patches_per_timepoint).to(torch.bool)
            decoder_mask[rand_patches[2*num_encoder_patches:2*num_encoder_patches+num_decoder_patches]] = True
            decoder_mask = decoder_mask.tile(num_frames//frame_patch_size)

            # encode the tube patches
            # encoder_out1 = model(func, encoder_mask=encoder_mask1, device=device)
            # encoder_out2 = model(func, encoder_mask=encoder_mask2, device=device)
            # if use_cls_token:
            #     enc_cls_token1 = encoder_out1[:,:1,:]
            #     enc_cls_token2 = encoder_out2[:,:1,:]

            encoder_out = model(func, encoder_mask=encoder_mask, device=device)
            if use_cls_token:
                enc_cls_token = encoder_out[:,:1,:]

            if use_decoder:
                # decode both the encoder_out patches and masked decoder patches
                decoder_out = model(encoder_out, encoder_mask=encoder_mask, decoder_mask=decoder_mask, device=device)
                # subset only the reconstructed decoder patches
                output = decoder_out[:, -decoder_mask.sum():]

                # compare to ground truth and calculate loss
                target_patches = model.patchify(func)
                target_patches_vit = rearrange(target_patches, "b ... d -> b (...) d")
                target = target_patches_vit.to(device)[:, decoder_mask]

                # Normalise the target with statistics taken over the batch dimension
                target_mean = target.mean(0)
                target_std = target.std(0)
                target_normed = (target - target_mean) / (target_std + 1e-6)

                recon_loss = mse(output, target_normed)
                recon_losses.append(recon_loss.item())
                loss = recon_loss
            else:
                recon_loss = torch.nan
                recon_losses.append(recon_loss)
                # NOTE(review): loss starts as a plain int here; if use_contrastive_loss
                # is also False, loss.backward() below has no tensor to call.
                loss = 0

            # contrastive loss
            if use_contrastive_loss:
                # encode the decoder patches
                encoder_out2 = model(func, encoder_mask=decoder_mask, device=device)
                enc_cls_token2 = encoder_out2[:,:1,:]

                temp = contrastive_temps[epoch]

                # NOTE(review): enc_cls_token is only assigned when use_cls_token is
                # True, so this branch relies on that flag being set.
                all_cls = torch.cat([enc_cls_token, enc_cls_token2], dim=0)
                # import pdb; pdb.set_trace()

                # Temperature-scaled cosine similarity between every pair of CLS tokens
                logits = (nn.functional.normalize(all_cls.flatten(1),dim=-1) @
                            nn.functional.normalize(all_cls.flatten(1),dim=-1).T) / temp

                # Positive pairs: row i and row i + N/2 (the two views of the same clip),
                # marked on the two off-diagonals at offset +/- N/2
                labels = torch.diag_embed(
                    torch.ones(logits.shape[0] // 2), offset=logits.shape[0] // 2
                ) + torch.diag_embed(torch.ones(logits.shape[0] // 2), offset=-logits.shape[0] // 2)
                labels = labels.to(device)

                # Drop the diagonal (self-similarity) from both logits and labels
                mask = torch.ones_like(logits).bool()
                torch.diagonal(mask).fill_(False)

                labels = labels[mask].reshape(logits.shape[0], logits.shape[0]-1)
                logits = logits[mask].reshape(*labels.shape)

                # Cross-entropy of each row's softmax against its positive entry
                contr_loss = -(logits.log_softmax(-1) * labels).sum(-1).mean()

                # logits = (nn.functional.normalize(model.cont(encoder_out.flatten(1)),dim=-1) @
                #             nn.functional.normalize(model.cont(encoder_out2.flatten(1)),dim=-1).T) / temp

                # labels = torch.arange(len(logits)).long().to(device)
                # loss1 = crossentropy(logits, labels)
                # loss2 = crossentropy(logits.T, labels)
                # contr_loss = (loss1 + loss2)/2

                contrastive_losses.append(contr_loss.item())
                loss += (contr_loss * contrastive_loss_weight)

            # Representation-similarity diagnostics (logged only, not part of the loss)
            cos_sim_encoder_output_patchwise.append(utils.patchwise_cosine_similarity(encoder_out).mean().item())
            cos_sim_encoder_output.append(utils.batchwise_cosine_similarity(encoder_out.flatten(1)/1e3,encoder_out.flatten(1)/1e3)[~torch.eye(len(encoder_out),dtype=torch.bool)].mean().item())
            if use_decoder:
                cos_sim_decoder_output.append(utils.batchwise_cosine_similarity(output,output)[~torch.eye(len(output),dtype=torch.bool)].mean().item())

            loss.backward()
            # NOTE(review): the value printed is the total loss, although it is labelled 'Cont loss'.
            print(f'Cont loss: {loss.item():.4f}', end='\r')
            optimizer.step()
            lrs.append(optimizer.param_groups[0]["lr"])
            if lr_scheduler is not None:
                lr_scheduler.step()
            train_losses.append(loss.item())

            # train_dl is iterated until this explicit break ends the epoch
            if train_i >= (num_iterations_per_epoch-1):
                break

        # reset linear_probe
        # if use_cls_token:
        #     linear_probe = LinearProbe((num_patches_per_timepoint+1)*model.encoder_embed_dim)
        # else:
        #     linear_probe = LinearProbe(num_patches_per_timepoint*model.encoder_embed_dim)
        # A new probe (and optimizer/scheduler) is created every epoch
        linear_probe = LinearProbe(model.encoder_embed_dim)
        linear_probe = linear_probe.to(device)
        probe_opt_grouped_parameters = [
            {'params': [p for n, p in linear_probe.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 1e-2},
            {'params': [p for n, p in linear_probe.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
        ]
        # NOTE(review): the optimizer lr (3e-4) and the scheduler max_lr (3e-5)
        # differ — confirm which value is intended.
        probe_optimizer = torch.optim.AdamW(probe_opt_grouped_parameters, lr=3e-4)
        probe_scheduler = torch.optim.lr_scheduler.OneCycleLR(
            probe_optimizer,
            max_lr=3e-5,
            total_steps=probe_num_iterations_per_epoch
        )

        if True:#(epoch % 5 == 0) or (epoch == num_epochs-1):
            model.eval()
            # optimizer.eval()
            linear_probe.train()
            # --- Fit the probe on training data with the encoder frozen ---
            for probe_i, batch in enumerate(train_dl):
                probe_optimizer.zero_grad()

                input_func = batch['func.npy']

                subject_id = batch['subject_id.txt']
                subject_id = torch.Tensor([int(subject[-2:]) for subject in subject_id]).long()
                subject_id = torch.repeat_interleave(subject_id.long(), 2).to(device)

                func, _ = aug_transform(input_func)
                func = func.reshape(-1, num_frames, 
                                    func.shape[-3], func.shape[-2], func.shape[-1])
                func = func.unsqueeze(1).float().to(device).clamp(0,1)

                # All-True mask: the encoder sees every patch
                encoder_mask = torch.ones(num_patches_per_timepoint).to(torch.bool)
                encoder_mask = encoder_mask.tile(num_frames//frame_patch_size)

                # encode the tube patches
                with torch.no_grad():
                    encoder_out = model(func, encoder_mask=encoder_mask, device=device)
                    # token 0 is taken as the CLS feature — assumes use_cls_token is True; TODO confirm
                    encoder_cls = encoder_out[:,0,:]
                    # encoder_out = nn.functional.normalize(encoder_out,dim=-1)

                # linear probe
                subject_pred = linear_probe(encoder_cls)
                probe_loss = crossentropy(subject_pred, subject_id-1) # minus 1 because subject_id is 1-indexed

                probe_accuracy = (torch.max(subject_pred,1).indices == (subject_id-1)).sum() / len(subject_id)
                probe_accs.append(probe_accuracy.item())
                probe_losses.append(probe_loss.item())

                print(probe_i, probe_accuracy.item(), probe_loss.item())

                probe_loss.backward()
                probe_optimizer.step()
                probe_scheduler.step()

                if probe_i >= (probe_num_iterations_per_epoch-1):
                    break

            # --- Evaluate the probe on test data (no optimizer step) ---
            # NOTE(review): linear_probe is still in train() mode here and its forward
            # pass is not wrapped in torch.no_grad().
            for test_i, batch in enumerate(test_dl):
                input_func = batch['func.npy']

                subject_id = batch['subject_id.txt']
                subject_id = torch.Tensor([int(subject[-2:]) for subject in subject_id]).long()
                subject_id = torch.repeat_interleave(subject_id.long(), 2).to(device)

                func, _ = aug_transform(input_func)
                func = func.reshape(-1, num_frames, 
                                    func.shape[-3], func.shape[-2], func.shape[-1])
                func = func.unsqueeze(1).float().to(device).clamp(0,1)

                encoder_mask = torch.ones(num_patches_per_timepoint).to(torch.bool)
                encoder_mask = encoder_mask.tile(num_frames//frame_patch_size)

                # encode the tube patches
                with torch.no_grad():
                    encoder_out = model(func, encoder_mask=encoder_mask, device=device)
                    encoder_cls = encoder_out[:,0,:]
                    # encoder_out = nn.functional.normalize(encoder_out,dim=-1)

                # linear probe
                subject_pred = linear_probe(encoder_cls)
                test_loss = crossentropy(subject_pred, subject_id-1) # minus 1 because subject_id is 1-indexed

                test_accuracy = (torch.max(subject_pred,1).indices == (subject_id-1)).sum() / len(subject_id)
                test_accs.append(test_accuracy.item())
                test_losses.append(test_loss.item())

                print("test", test_i, test_accuracy.item(), test_loss.item())

                # only two test batches are evaluated per epoch
                if test_i >= 1:
                    break

        # Epoch summary: each history is averaged over the entries appended this
        # epoch (the last train_i+1 / probe_i+1 / test_i+1 items).
        # NOTE(review): contrastive_losses stays empty when use_contrastive_loss is
        # False, so its mean would be taken over an empty list.
        logs = {
            "train/loss": np.mean(train_losses[-(train_i + 1) :]),
            "train/recon_losses": np.mean(recon_losses[-(train_i + 1) :]),
            "train/contrastive_losses": np.mean(contrastive_losses[-(train_i + 1) :]),
            "train/num_steps": len(recon_losses),
            "train/cos_sim_encoder_output": np.mean(cos_sim_encoder_output[-(train_i + 1) :]),
            "train/cos_sim_decoder_output": np.mean(cos_sim_decoder_output[-(train_i + 1) :]) if use_decoder else np.nan,
            "train/cos_sim_encoder_output_patchwise": np.mean(cos_sim_encoder_output_patchwise[-(train_i + 1) :]),
            "train/probe_losses": np.mean(probe_losses[-(probe_i + 1) :]),
            "train/probe_accs": np.mean(probe_accs[-(probe_i + 1) :]),
            "test/probe_losses": np.mean(test_losses[-(test_i + 1) :]),
            "test/probe_accs": np.mean(test_accs[-(test_i + 1) :]),
            "lr": np.mean(lrs[-(train_i + 1) :]),
            "epoch": epoch,
            "tube_mask_ratio": tube_mask_ratio,
            "decoder_mask_ratio": decoder_mask_ratio,
        }
        progress_bar.set_postfix(**logs)
        if utils.is_interactive(): print(logs)

        # Plot progress (first sample in batch)
        # Uses output / target_std / target_mean / target_patches_vit / decoder_mask
        # left over from the last training step of this epoch.
        # NOTE(review): `func` was reassigned by the probe/test loops above, so the
        # "orig" image comes from the last test batch — confirm this is intended.
        with torch.no_grad():
            if use_decoder and (utils.is_interactive() or wandb_log):
                if epoch % 50 == 0:
                    # undo the target normalisation applied before the loss
                    output = (output * target_std) + target_mean
                    idx = 0

                    # Scatter the reconstructed patches into an all-zero patch grid
                    decode_vis = torch.zeros_like(target_patches_vit)
                    decode_vis[:, decoder_mask] = output.to(decode_vis.device).to(decode_vis.dtype)
                    decoder_unpatches = rearrange(
                        decode_vis,
                        "b (f d h w) c -> b f d h w c",
                        d=img_size[0]//patch_depth,
                        h=img_size[1]//patch_height,
                        w=img_size[2]//patch_width,
                    )
                    # Inverse of the patchify layout: back to (b, c, frames, d, h, w)
                    decoder_func = rearrange(
                        decoder_unpatches,
                        "b f d h w (pd ph pw pf c) -> b c (f pf) (d pd) (h ph) (w pw)",
                        b=batch_size*2,
                        f=num_frames//frame_patch_size,
                        d=img_size[0]//patch_depth,
                        h=img_size[1]//patch_height,
                        w=img_size[2]//patch_width,
                        pd=patch_depth,
                        ph=patch_height,
                        pw=patch_width,
                        pf=frame_patch_size,
                    )
                    orig_image = utils.reshape_to_2d(func[idx])
                    recon_image = utils.reshape_to_2d(decoder_func[idx])

                    # Overlay: original everywhere, reconstruction where it is non-zero
                    combined_image = orig_image.clone()
                    combined_image[recon_image!=0] = recon_image[recon_image!=0]

                    # NOTE(review): despite the name this is a fixed column window (3100..3449)
                    random_start = np.arange(3100,3450)
                    orig_image = transforms.ToPILImage()(orig_image[:,random_start])
                    recon_image = transforms.ToPILImage()(recon_image[:,random_start])
                    combined_image = transforms.ToPILImage()(combined_image[:,random_start])

                    if wandb_log:
                        logs[f"train/orig"] = wandb.Image(orig_image, caption=f"epoch{epoch:03d}")
                        logs[f"train/recon"] = wandb.Image(recon_image, caption=f"epoch{epoch:03d}")
                        logs[f"train/combined"] = wandb.Image(combined_image, caption=f"epoch{epoch:03d}")
                    else:
                        if epoch==0:
                            print("orig_image")
                            display(orig_image)
                            print("recon_image")
                            display(recon_image)
                            print("combined_image")
                        # NOTE(review): this display sits outside the epoch==0 branch, so
                        # the combined image is shown on every plotted epoch.
                        display(combined_image)

    if wandb_log: wandb.log(logs)

    # Save model checkpoint
    if (ckpt_saving) and ((epoch % ckpt_interval == 0) or (epoch==num_epochs-1)):
        save_ckpt(model,"last")

    # wait for other GPUs to catch up if needed
    if distributed: dist.barrier()
    torch.cuda.empty_cache()

# Tear down the process group once all epochs are done
if distributed:
    dist.destroy_process_group()

Number of available CUDA devices: 1
LOCAL RANK=0
NUM GPUS=1
NODE=0
GLOBAL RANK=0
WORLD_SIZE=1
PID of this process = 3543417
device = cuda distributed = False num_devices = 1 local rank = 0 world size = 1 data_type = torch.float32
{'model_name': 'base_nomae_32bs_adam', 'use_cls_token': True, 'use_contrastive_loss': True, 'use_decoder': False, 'contrastive_loss_weight': 1.0, 'batch_size': 32, 'num_workers': 10, 'num_epochs': 400, 'seed': 42, 'max_lr': 0.0003, 'num_samples_per_epoch': 1024, 'test_num_samples_per_epoch': 384, 'ckpt_saving': True, 'ckpt_interval': 50, 'resume_from_ckpt': True, 'wandb_log': True, 'tube_start_masking_ratio': 0.75, 'tube_end_masking_ratio': 0.75, 'decoder_mask_ratio': 0.75, 'patch_size': [8, 8, 8], 'frame_patch_size': 4, 'use_rope_emb': False, 'masking_strategy': 'None', 'encoder_model': 'vit_base', 'decoder_model': 'none', 'img_size': [88, 104, 72], 'num_frames': 4, 'is_s3': False, 'train_urls': ['/weka/proj-fmri/shared/NSD_MNI_wds/{000000..000699}.tar'], 'te

[34m[1mwandb[0m: Currently logged in as: [33matom_101[0m. Use [1m`wandb login --relogin`[0m to force relogin


Overall:   0%|                                                                 | 0/400 [00:00<?, ?it/s]

0 0.40625 2.171462297439575
1 0.4375 2.1890692710876465
2 0.25 2.34234356880188
3 0.15625 2.0172922611236572
4 0.375 1.8765537738800049
5 0.1875 2.168997049331665
6 0.1875 2.0623626708984375
7 0.09375 2.4891469478607178
8 0.125 2.3317131996154785
9 0.375 1.9764039516448975
10 0.3125 2.011580228805542
11 0.25 1.9407715797424316
test 0 0.28125 2.166940689086914


Overall:   0%| | 1/400 [07:26<49:32:16, 446.96s/it, decoder_mask_ratio=0.75, epoch=0, lr=1.43e-5, test/

test 1 0.25 2.254328966140747
{'train/loss': 4.8803403079509735, 'train/recon_losses': nan, 'train/contrastive_losses': 4.8803403079509735, 'train/num_steps': 32, 'train/cos_sim_encoder_output': 0.8098887130618095, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.6359830163419247, 'train/probe_losses': 2.131474733352661, 'train/probe_accs': 0.2630208333333333, 'test/probe_losses': 2.2106348276138306, 'test/probe_accs': 0.265625, 'lr': 1.4262106821979437e-05, 'epoch': 0, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.53125 1.7801884412765503
1 0.5625 1.806705355644226
2 0.5625 1.590114951133728
3 0.46875 1.5834414958953857
4 0.5 1.405739665031433
5 0.25 1.821511149406433
6 0.3125 1.5903902053833008
7 0.5625 1.3203375339508057
8 0.4375 1.4621493816375732
9 0.46875 1.872227668762207
10 0.34375 2.275847911834717
11 0.21875 2.838721513748169
test 0 0.21875 2.8589582443237305
test 1 0.125 2.855853796005249


Overall:   0%| | 2/400 [15:07<50:19:36, 455.22s/it, decoder_mask_ratio=0.75, epoch=1, lr=2.8e-5, test/p

{'train/loss': 4.738204970955849, 'train/recon_losses': nan, 'train/contrastive_losses': 4.738204970955849, 'train/num_steps': 64, 'train/cos_sim_encoder_output': 0.891691405326128, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.78125542961061, 'train/probe_losses': 1.7789479394753773, 'train/probe_accs': 0.4348958333333333, 'test/probe_losses': 2.8574060201644897, 'test/probe_accs': 0.171875, 'lr': 2.8001182400136273e-05, 'epoch': 1, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.2535605430603027
1 0.0 2.337804079055786
2 0.0 2.1744649410247803
3 0.125 2.168320894241333
4 0.1875 2.0258986949920654
5 0.484375 1.918904423713684
6 0.4375 1.8555352687835693
7 0.09375 2.236203193664551
8 0.21875 2.22759747505188
9 0.21875 2.136794328689575
10 0.15625 2.1037685871124268
11 0.125 2.286970853805542
test 0 0.0 2.3104681968688965


Overall:   0%| | 2/400 [22:07<50:19:36, 455.22s/it, decoder_mask_ratio=0.75, epoch=2, lr=5.43e-5, test/

test 1 0.0 2.225217342376709
{'train/loss': 4.607143417000771, 'train/recon_losses': nan, 'train/contrastive_losses': 4.607143417000771, 'train/num_steps': 96, 'train/cos_sim_encoder_output': 0.9103570729494095, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.821470245718956, 'train/probe_losses': 2.143818606932958, 'train/probe_accs': 0.17057291666666666, 'test/probe_losses': 2.2678427696228027, 'test/probe_accs': 0.0, 'lr': 5.434769930422135e-05, 'epoch': 2, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}


Overall:   1%| | 3/400 [22:07<48:25:15, 439.08s/it, decoder_mask_ratio=0.75, epoch=2, lr=5.43e-5, test/

0 0.0 2.240022659301758
1 0.0 2.2437448501586914
2 0.0 2.100266695022583
3 0.1875 1.9837300777435303
4 0.21875 2.089029550552368
5 0.0625 2.132296323776245
6 0.0625 2.09609317779541
7 0.515625 1.9976738691329956
8 0.28125 1.9797132015228271
9 0.5625 1.8117334842681885
10 0.25 1.8811153173446655
11 0.15625 2.2385270595550537
test 0 0.09375 1.6926982402801514


Overall:   1%| | 4/400 [30:26<50:53:10, 462.60s/it, decoder_mask_ratio=0.75, epoch=3, lr=9.07e-5, test/

test 1 0.125 1.6178581714630127
{'train/loss': 4.212903954088688, 'train/recon_losses': nan, 'train/contrastive_losses': 4.212903954088688, 'train/num_steps': 128, 'train/cos_sim_encoder_output': 0.9354336988180876, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.8742097113281488, 'train/probe_losses': 2.06616218884786, 'train/probe_accs': 0.19140625, 'test/probe_losses': 1.655278205871582, 'test/probe_accs': 0.109375, 'lr': 9.070661664058203e-05, 'epoch': 3, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.09375 2.2130584716796875
1 0.09375 2.2497825622558594
2 0.15625 2.0313267707824707
3 0.390625 1.936642050743103
4 0.28125 1.953561782836914
5 0.375 1.8053874969482422
6 0.21875 2.050131320953369
7 0.21875 1.9026596546173096
8 0.15625 1.9551727771759033
9 0.125 1.9052075147628784
10 0.21875 1.7341456413269043
11 0.0625 2.2920498847961426
test 0 0.0 2.7497270107269287


Overall:   1%| | 4/400 [37:37<50:53:10, 462.60s/it, decoder_mask_ratio=0.75, epoch=4, lr=0.000133, test

test 1 0.0 2.5450377464294434
{'train/loss': 4.4862887263298035, 'train/recon_losses': nan, 'train/contrastive_losses': 4.4862887263298035, 'train/num_steps': 160, 'train/cos_sim_encoder_output': 0.9754134435206652, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9459457024931908, 'train/probe_losses': 2.0024271607398987, 'train/probe_accs': 0.19921875, 'test/probe_losses': 2.647382378578186, 'test/probe_accs': 0.0, 'lr': 0.00013349670666361975, 'epoch': 4, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}


Overall:   1%| | 5/400 [37:37<49:30:19, 451.19s/it, decoder_mask_ratio=0.75, epoch=4, lr=0.000133, test

0 0.0 2.097470760345459
1 0.125 2.0390849113464355
2 0.21875 1.977622151374817
3 0.3125 1.8147311210632324
4 0.125 2.242213487625122
5 0.40625 1.8541918992996216
6 0.125 2.2938120365142822
7 0.0625 2.2281816005706787
8 0.40625 1.9525691270828247
9 0.3125 2.069281816482544
10 0.1875 2.2318596839904785
11 0.125 2.085010528564453
test 0 0.625 1.6917496919631958
test 1 0.375 2.1546943187713623


Overall:   2%| | 6/400 [44:02<46:54:08, 428.55s/it, decoder_mask_ratio=0.75, epoch=5, lr=0.000179, test

{'train/loss': 4.365065790712833, 'train/recon_losses': nan, 'train/contrastive_losses': 4.365065790712833, 'train/num_steps': 192, 'train/cos_sim_encoder_output': 0.9782969448715448, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9453140366822481, 'train/probe_losses': 2.0738357603549957, 'train/probe_accs': 0.20052083333333334, 'test/probe_losses': 1.923222005367279, 'test/probe_accs': 0.5, 'lr': 0.0001785032933363802, 'epoch': 5, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.21875 2.029076099395752
1 0.34375 1.8816025257110596
2 0.34375 1.8523893356323242
3 0.21875 1.934616208076477
4 0.1875 2.186103343963623
5 0.03125 2.1770377159118652
6 0.28125 1.8604516983032227
7 0.421875 1.894477128982544
8 0.25 2.06679368019104
9 0.1875 2.033475637435913
10 0.40625 1.8768653869628906
11 0.21875 2.13458514213562
test 0 0.34375 2.023177146911621
test 1 0.21875 2.0806493759155273


Overall:   2%| | 7/400 [52:52<50:25:27, 461.90s/it, decoder_mask_ratio=0.75, epoch=6, lr=0.000221, test

{'train/loss': 4.294267676770687, 'train/recon_losses': nan, 'train/contrastive_losses': 4.294267676770687, 'train/num_steps': 224, 'train/cos_sim_encoder_output': 0.9794473312795162, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9584855977445841, 'train/probe_losses': 1.9939561585585277, 'train/probe_accs': 0.2591145833333333, 'test/probe_losses': 2.051913261413574, 'test/probe_accs': 0.28125, 'lr': 0.00022129338335941795, 'epoch': 6, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.2156901359558105
1 0.0 2.140716791152954
2 0.0 2.025994300842285
3 0.1875 2.029874324798584
4 0.1875 1.7985161542892456
5 0.25 1.7575780153274536
6 0.25 1.6215050220489502
7 0.15625 1.6351618766784668
8 0.0625 2.369542121887207
9 0.21875 2.539128541946411
10 0.15625 2.768519878387451
11 0.125 2.621241807937622
test 0 0.0 2.8447890281677246
test 1 0.0 2.4319355487823486


Overall:   2%| | 8/400 [59:37<48:19:48, 443.85s/it, decoder_mask_ratio=0.75, epoch=7, lr=0.000258, test

{'train/loss': 4.8231073170900345, 'train/recon_losses': nan, 'train/contrastive_losses': 4.8231073170900345, 'train/num_steps': 256, 'train/cos_sim_encoder_output': 0.9926067087799311, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9725668895989656, 'train/probe_losses': 2.12695574760437, 'train/probe_accs': 0.1328125, 'test/probe_losses': 2.6383622884750366, 'test/probe_accs': 0.0, 'lr': 0.00025765230069577864, 'epoch': 7, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.359375 2.0477941036224365
1 0.375 2.0364160537719727
2 0.125 2.0535898208618164
3 0.3125 1.8323661088943481
4 0.375 1.7673838138580322
5 0.40625 1.6575413942337036
6 0.640625 1.567030906677246
7 0.46875 2.004265069961548
8 0.40625 1.7495614290237427
9 0.28125 1.5742990970611572
10 0.0625 1.7900863885879517
11 0.03125 1.6854301691055298
test 0 0.0 2.434046983718872
test 1 0.0 2.7039601802825928


Overall:   2%| | 9/400 [1:05:55<45:57:48, 423.19s/it, decoder_mask_ratio=0.75, epoch=8, lr=0.000284, te

{'train/loss': 4.541162803769112, 'train/recon_losses': nan, 'train/contrastive_losses': 4.541162803769112, 'train/num_steps': 288, 'train/cos_sim_encoder_output': 0.9812384489923716, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.948988139629364, 'train/probe_losses': 1.8138136963049571, 'train/probe_accs': 0.3203125, 'test/probe_losses': 2.5690035820007324, 'test/probe_accs': 0.0, 'lr': 0.0002839988175998637, 'epoch': 8, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0625 2.2851011753082275
1 0.125 2.1954774856567383
2 0.03125 2.2799482345581055
3 0.0625 2.0269088745117188
4 0.25 2.10931134223938
5 0.40625 1.8378281593322754
6 0.28125 2.035501718521118
7 0.34375 1.7976198196411133
8 0.1875 2.087782382965088
9 0.21875 2.2079615592956543
10 0.0625 2.2489242553710938
11 0.1875 2.158040761947632
test 0 0.0 2.3208272457122803


Overall:   2%| | 10/400 [1:14:31<48:56:09, 451.72s/it, decoder_mask_ratio=0.75, epoch=9, lr=0.000298, t

test 1 0.21875 2.0644659996032715
{'train/loss': 4.785091236233711, 'train/recon_losses': nan, 'train/contrastive_losses': 4.785091236233711, 'train/num_steps': 320, 'train/cos_sim_encoder_output': 0.9838101547211409, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9438603594899178, 'train/probe_losses': 2.1058671474456787, 'train/probe_accs': 0.18489583333333334, 'test/probe_losses': 2.192646622657776, 'test/probe_accs': 0.109375, 'lr': 0.00029773789317802054, 'epoch': 9, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.15625 2.043250560760498
1 0.125 2.063730001449585
2 0.125 1.9198827743530273
3 0.34375 1.766335129737854
4 0.125 2.023534059524536
5 0.21875 2.290461540222168
6 0.03125 2.408841133117676
7 0.0 2.295419216156006
8 0.09375 2.211548089981079
9 0.0 2.4405112266540527
10 0.0 2.643171548843384
11 0.0 2.8640997409820557
test 0 0.0 2.2104275226593018
test 1 0.0 2.462897539138794


Overall:   3%| | 11/400 [1:22:40<50:03:35, 463.28s/it, decoder_mask_ratio=0.75, epoch=10, lr=0.0003, te

{'train/loss': 4.84112049639225, 'train/recon_losses': nan, 'train/contrastive_losses': 4.84112049639225, 'train/num_steps': 352, 'train/cos_sim_encoder_output': 0.9943470675498247, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9822605326771736, 'train/probe_losses': 2.247565418481827, 'train/probe_accs': 0.1015625, 'test/probe_losses': 2.336662530899048, 'test/probe_accs': 0.0, 'lr': 0.00029999830101689777, 'epoch': 10, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.198155641555786
1 0.0 2.162228584289551
2 0.375 1.9860769510269165
3 0.21875 2.0758626461029053
4 0.28125 2.147369384765625
5 0.15625 2.162106990814209
6 0.0 2.311218023300171
7 0.15625 2.0704851150512695
8 0.15625 1.9821897745132446
9 0.03125 1.9346354007720947
10 0.1875 1.7862781286239624
11 0.1875 2.064256429672241
test 0 0.3125 1.829947829246521
test 1 0.40625 1.7819873094558716


Overall:   3%| | 12/400 [1:30:59<51:05:35, 474.06s/it, decoder_mask_ratio=0.75, epoch=11, lr=0.0003, te

{'train/loss': 4.839182496070862, 'train/recon_losses': nan, 'train/contrastive_losses': 4.839182496070862, 'train/num_steps': 384, 'train/cos_sim_encoder_output': 0.9951165448874235, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9830837342888117, 'train/probe_losses': 2.073405255873998, 'train/probe_accs': 0.14583333333333334, 'test/probe_losses': 1.8059675693511963, 'test/probe_accs': 0.359375, 'lr': 0.00029998841615493663, 'epoch': 11, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.21875 2.1301586627960205
1 0.125 2.1534478664398193
2 0.125 2.081683874130249
3 0.34375 1.941015362739563
4 0.4375 2.0430073738098145
5 0.4375 1.8948291540145874
6 0.34375 1.9639537334442139
7 0.25 2.025745153427124
8 0.125 2.203176736831665
9 0.25 2.0227677822113037
10 0.375 1.8364875316619873
11 0.21875 1.9305020570755005
test 0 0.0 2.5630197525024414


Overall:   3%| | 13/400 [1:40:33<54:14:01, 504.50s/it, decoder_mask_ratio=0.75, epoch=12, lr=0.0003, te

test 1 0.0 2.72424578666687
{'train/loss': 4.785556748509407, 'train/recon_losses': nan, 'train/contrastive_losses': 4.785556748509407, 'train/num_steps': 416, 'train/cos_sim_encoder_output': 0.995446627959609, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9862664341926575, 'train/probe_losses': 2.018897940715154, 'train/probe_accs': 0.2708333333333333, 'test/probe_losses': 2.6436327695846558, 'test/probe_accs': 0.0, 'lr': 0.0002999687991489244, 'epoch': 12, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.3125 2.124890089035034
1 0.125 2.3829874992370605
2 0.28125 1.9820443391799927
3 0.28125 1.8024492263793945
4 0.53125 1.7141973972320557
5 0.53125 1.6161807775497437
6 0.5 1.5316840410232544
7 0.5 1.879585862159729
8 0.59375 1.8557695150375366
9 0.5625 1.8579468727111816
10 0.59375 1.8761063814163208
11 0.65625 1.6577630043029785
test 0 0.625 1.6946824789047241
test 1 0.609375 1.7540535926818848


Overall:   4%| | 14/400 [1:47:37<51:29:15, 480.20s/it, decoder_mask_ratio=0.75, epoch=13, lr=0.0003, te

{'train/loss': 4.773969680070877, 'train/recon_losses': nan, 'train/contrastive_losses': 4.773969680070877, 'train/num_steps': 448, 'train/cos_sim_encoder_output': 0.9850225299596786, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9631617739796638, 'train/probe_losses': 1.856800417105357, 'train/probe_accs': 0.4557291666666667, 'test/probe_losses': 1.7243680357933044, 'test/probe_accs': 0.6171875, 'lr': 0.0002999394512717804, 'epoch': 13, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.375 1.9128342866897583
1 0.46875 1.7897125482559204
2 0.03125 2.1476211547851562
3 0.1875 1.7078502178192139
4 0.5 1.5790406465530396
5 0.1875 2.2995142936706543
6 0.125 2.4792866706848145
7 0.0625 2.827742099761963
8 0.0625 2.5117170810699463
9 0.09375 2.6929588317871094
10 0.0 2.658358097076416
11 0.03125 2.849177360534668
test 0 0.0 2.507863759994507


Overall:   4%| | 14/400 [1:52:25<51:29:15, 480.20s/it, decoder_mask_ratio=0.75, epoch=14, lr=0.0003, te

test 1 0.21875 2.3176698684692383


Overall:   4%| | 15/400 [1:52:25<45:09:04, 422.19s/it, decoder_mask_ratio=0.75, epoch=14, lr=0.0003, te

{'train/loss': 4.45528544485569, 'train/recon_losses': nan, 'train/contrastive_losses': 4.45528544485569, 'train/num_steps': 480, 'train/cos_sim_encoder_output': 0.990606402978301, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9782722238451242, 'train/probe_losses': 2.287984440724055, 'train/probe_accs': 0.17708333333333334, 'test/probe_losses': 2.4127668142318726, 'test/probe_accs': 0.109375, 'lr': 0.00029990037442784645, 'epoch': 14, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 1.8507139682769775
1 0.0 1.932638168334961
2 0.6875 1.7975537776947021
3 0.375 1.7800564765930176
4 0.21875 2.258817195892334
5 0.46875 1.7262678146362305
6 0.21875 2.3432838916778564
7 0.3125 2.0247960090637207
8 0.5 1.5928945541381836
9 0.46875 1.8935500383377075
10 0.3125 2.157095193862915
11 0.25 2.2109222412109375
test 0 0.1875 2.748546838760376
test 1 0.34375 2.3603267669677734


Overall:   4%| | 16/400 [2:00:37<47:16:56, 443.27s/it, decoder_mask_ratio=0.75, epoch=15, lr=0.0003, te

{'train/loss': 4.449665904045105, 'train/recon_losses': nan, 'train/contrastive_losses': 4.449665904045105, 'train/num_steps': 512, 'train/cos_sim_encoder_output': 0.9915024694055319, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9870041366666555, 'train/probe_losses': 1.964049110809962, 'train/probe_accs': 0.3177083333333333, 'test/probe_losses': 2.5544368028640747, 'test/probe_accs': 0.265625, 'lr': 0.0002998515711527629, 'epoch': 15, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.2031397819519043
1 0.0 2.1282708644866943
2 0.3125 2.006335973739624
3 0.25 1.877398133277893
4 0.21875 1.831699252128601
5 0.40625 1.6240160465240479
6 0.28125 1.5632851123809814
7 0.390625 1.3284611701965332
8 0.3125 2.0030229091644287
9 0.3125 2.381582498550415
10 0.46875 1.9271172285079956
11 0.34375 2.077949285507202
test 0 0.0 2.5740206241607666
test 1 0.03125 2.485369920730591


Overall:   4%| | 17/400 [2:09:03<49:09:08, 462.01s/it, decoder_mask_ratio=0.75, epoch=16, lr=0.0003, te

{'train/loss': 4.2565952911973, 'train/recon_losses': nan, 'train/contrastive_losses': 4.2565952911973, 'train/num_steps': 544, 'train/cos_sim_encoder_output': 0.9876903146505356, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9850409906357527, 'train/probe_losses': 1.91268985470136, 'train/probe_accs': 0.2747395833333333, 'test/probe_losses': 2.5296952724456787, 'test/probe_accs': 0.015625, 'lr': 0.00029979304461330423, 'epoch': 16, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.15625 2.25361967086792
1 0.15625 2.0440890789031982
2 0.21875 1.9667648077011108
3 0.15625 1.9045813083648682
4 0.125 1.929797887802124
5 0.03125 1.9553027153015137
6 0.1875 1.7843608856201172
7 0.46875 1.901329517364502
8 0.25 2.3039286136627197
9 0.28125 2.3781659603118896
10 0.34375 2.2262415885925293
11 0.28125 2.3915441036224365
test 0 0.0 2.8685805797576904
test 1 0.0 2.745845079421997


Overall:   4%| | 18/400 [2:15:15<46:08:19, 434.82s/it, decoder_mask_ratio=0.75, epoch=17, lr=0.0003, te

{'train/loss': 4.2111397087574005, 'train/recon_losses': nan, 'train/contrastive_losses': 4.2111397087574005, 'train/num_steps': 576, 'train/cos_sim_encoder_output': 0.987125588580966, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9794106464833021, 'train/probe_losses': 2.0866438448429108, 'train/probe_accs': 0.22135416666666666, 'test/probe_losses': 2.8072128295898438, 'test/probe_accs': 0.0, 'lr': 0.0002997247986071739, 'epoch': 17, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.25 2.0546374320983887
1 0.15625 2.074091911315918
2 0.25 1.8259129524230957
3 0.46875 1.62313973903656
4 0.34375 1.9049957990646362
5 0.1875 2.1050031185150146
6 0.0625 2.3196160793304443
7 0.0625 2.3392913341522217
8 0.09375 2.503237247467041
9 0.0625 2.4480233192443848
10 0.0 2.407748222351074
11 0.0625 1.8984969854354858
test 0 0.0 1.91921865940094
test 1 0.0 2.112938165664673


Overall:   5%| | 19/400 [2:22:19<45:41:06, 431.67s/it, decoder_mask_ratio=0.75, epoch=18, lr=0.0003, te

{'train/loss': 4.183203771710396, 'train/recon_losses': nan, 'train/contrastive_losses': 4.183203771710396, 'train/num_steps': 608, 'train/cos_sim_encoder_output': 0.984475452452898, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9816086478531361, 'train/probe_losses': 2.1253495117028556, 'train/probe_accs': 0.16666666666666666, 'test/probe_losses': 2.0160784125328064, 'test/probe_accs': 0.0, 'lr': 0.0002996468375627573, 'epoch': 18, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.3125 2.0619864463806152
1 0.3125 1.9870030879974365
2 0.3125 1.907774806022644
3 0.21875 1.903266429901123
4 0.1875 1.8155304193496704
5 0.21875 2.192409038543701
6 0.40625 1.83955979347229
7 0.78125 1.5423682928085327
8 0.5 1.7265595197677612
9 0.5625 1.6107964515686035
10 0.265625 2.139040946960449
11 0.28125 1.8543145656585693
test 0 0.515625 1.5263808965682983
test 1 0.25 1.8645122051239014


Overall:   5%| | 20/400 [2:30:21<47:09:54, 446.83s/it, decoder_mask_ratio=0.75, epoch=19, lr=0.0003, te

{'train/loss': 4.16776355355978, 'train/recon_losses': nan, 'train/contrastive_losses': 4.16776355355978, 'train/num_steps': 640, 'train/cos_sim_encoder_output': 0.984997259452939, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9889219831675291, 'train/probe_losses': 1.8817174832026164, 'train/probe_accs': 0.36328125, 'test/probe_losses': 1.6954465508460999, 'test/probe_accs': 0.3828125, 'lr': 0.00029955916653883494, 'epoch': 19, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.15625 2.056802272796631
1 0.25 1.9794470071792603
2 0.09375 1.9358779191970825
3 0.375 1.9197797775268555
4 0.375 1.8614866733551025
5 0.21875 1.9264568090438843
6 0.09375 2.048452377319336
7 0.09375 2.2366669178009033
8 0.25 1.8689414262771606
9 0.15625 2.3061485290527344
10 0.28125 1.750924825668335
11 0.21875 2.083221912384033
test 0 0.28125 1.8180577754974365
test 1 0.25 1.6836103200912476


Overall:   5%| | 21/400 [2:37:04<45:38:26, 433.53s/it, decoder_mask_ratio=0.75, epoch=20, lr=0.000299, 

{'train/loss': 4.1234301552176476, 'train/recon_losses': nan, 'train/contrastive_losses': 4.1234301552176476, 'train/num_steps': 672, 'train/cos_sim_encoder_output': 0.9823487121611834, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9917970784008503, 'train/probe_losses': 1.9978505373001099, 'train/probe_accs': 0.21354166666666666, 'test/probe_losses': 1.750834047794342, 'test/probe_accs': 0.265625, 'lr': 0.00029946179122425387, 'epoch': 20, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.34375 2.086493730545044
1 0.21875 2.19018816947937
2 0.03125 2.1736958026885986
3 0.25 1.9761362075805664
4 0.21875 2.00575852394104
5 0.0625 2.2389159202575684
6 0.0625 2.207885980606079
7 0.03125 1.9615247249603271
8 0.0625 1.9046858549118042
9 0.25 1.7412779331207275
10 0.125 2.2524423599243164
11 0.21875 2.465214729309082
test 0 0.28125 1.875416874885559
test 1 0.15625 1.8483161926269531


Overall:   6%| | 22/400 [2:42:30<42:09:21, 401.49s/it, decoder_mask_ratio=0.75, epoch=21, lr=0.000299, 

{'train/loss': 4.079931169748306, 'train/recon_losses': nan, 'train/contrastive_losses': 4.079931169748306, 'train/num_steps': 704, 'train/cos_sim_encoder_output': 0.9793044906109571, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9920373018831015, 'train/probe_losses': 2.1003516614437103, 'train/probe_accs': 0.15625, 'test/probe_losses': 1.861866533756256, 'test/probe_accs': 0.21875, 'lr': 0.00029935471793755873, 'epoch': 21, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.34375 1.9438836574554443
1 0.25 1.9759750366210938
2 0.0625 2.081589698791504
3 0.15625 2.027409791946411
4 0.3125 1.8484671115875244
5 0.34375 1.8159667253494263
6 0.1875 1.9957332611083984
7 0.28125 1.7988361120224
8 0.0625 1.9616395235061646
9 0.09375 1.857356309890747
10 0.140625 1.8424067497253418
11 0.21875 1.8456690311431885
test 0 0.09375 2.0118472576141357
test 1 0.0625 2.0021963119506836


Overall:   6%| | 23/400 [2:47:10<38:13:48, 365.06s/it, decoder_mask_ratio=0.75, epoch=22, lr=0.000299, 

{'train/loss': 4.045125998556614, 'train/recon_losses': nan, 'train/contrastive_losses': 4.045125998556614, 'train/num_steps': 736, 'train/cos_sim_encoder_output': 0.9795737341046333, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9919315576553345, 'train/probe_losses': 1.9162444174289703, 'train/probe_accs': 0.20442708333333334, 'test/probe_losses': 2.0070217847824097, 'test/probe_accs': 0.078125, 'lr': 0.0002992379536265817, 'epoch': 22, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.15625 2.2464256286621094
1 0.1875 2.2854230403900146
2 0.15625 2.2838997840881348
3 0.125 1.9704124927520752
4 0.375 1.7457183599472046
5 0.0625 1.9832086563110352
6 0.453125 1.8495502471923828
7 0.40625 1.7315831184387207
8 0.3125 2.1688835620880127
9 0.15625 2.3901236057281494
10 0.0625 2.5624003410339355
11 0.09375 2.6355831623077393
test 0 0.0 2.2995898723602295
test 1 0.0 2.5963971614837646


Overall:   6%| | 24/400 [2:55:13<41:47:49, 400.18s/it, decoder_mask_ratio=0.75, epoch=23, lr=0.000299, 

{'train/loss': 4.023701265454292, 'train/recon_losses': nan, 'train/contrastive_losses': 4.023701265454292, 'train/num_steps': 768, 'train/cos_sim_encoder_output': 0.9812890812754631, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9919873904436827, 'train/probe_losses': 2.1544343332449594, 'train/probe_accs': 0.21223958333333334, 'test/probe_losses': 2.447993516921997, 'test/probe_accs': 0.0, 'lr': 0.0002991115058679916, 'epoch': 23, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0625 2.067154884338379
1 0.09375 2.2626559734344482
2 0.1875 2.115669012069702
3 0.125 1.9761366844177246
4 0.28125 1.7108794450759888
5 0.21875 1.741477131843567
6 0.1875 2.1785759925842285
7 0.1875 2.062587022781372
8 0.09375 2.1934804916381836
9 0.21875 1.8968610763549805
10 0.625 1.5470473766326904
11 0.46875 1.9471585750579834
test 0 0.09375 2.7294981479644775
test 1 0.3125 2.2653744220733643


Overall:   6%| | 25/400 [3:00:32<39:10:40, 376.11s/it, decoder_mask_ratio=0.75, epoch=24, lr=0.000299, 

{'train/loss': 3.9893718287348747, 'train/recon_losses': nan, 'train/contrastive_losses': 3.9893718287348747, 'train/num_steps': 800, 'train/cos_sim_encoder_output': 0.9814906623214483, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.991666404530406, 'train/probe_losses': 1.9749736388524373, 'train/probe_accs': 0.22916666666666666, 'test/probe_losses': 2.497436285018921, 'test/probe_accs': 0.203125, 'lr': 0.00029897538286680227, 'epoch': 24, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.251375675201416
1 0.0 2.236783742904663
2 0.15625 2.142681837081909
3 0.40625 1.916544795036316
4 0.46875 1.6768964529037476
5 0.3125 1.8971649408340454
6 0.28125 1.9903920888900757
7 0.125 1.9988139867782593
8 0.3125 1.705273151397705
9 0.28125 1.6649893522262573
10 0.21875 2.1861302852630615
11 0.15625 1.9252026081085205
test 0 0.09375 2.1333084106445312
test 1 0.03125 2.255235195159912


Overall:   6%| | 26/400 [3:09:11<43:31:12, 418.91s/it, decoder_mask_ratio=0.75, epoch=25, lr=0.000299, 

{'train/loss': 4.108893394470215, 'train/recon_losses': nan, 'train/contrastive_losses': 4.108893394470215, 'train/num_steps': 832, 'train/cos_sim_encoder_output': 0.9856468290090561, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9920754134654999, 'train/probe_losses': 1.9660207430521648, 'train/probe_accs': 0.2265625, 'test/probe_losses': 2.1942718029022217, 'test/probe_accs': 0.0625, 'lr': 0.0002988295934558401, 'epoch': 25, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.28125 2.3214166164398193
1 0.4375 2.1287903785705566
2 0.3125 1.99045991897583
3 0.15625 1.7084428071975708
4 0.59375 1.3125081062316895
5 0.4375 1.5150799751281738
6 0.46875 1.6392207145690918
7 0.4375 1.694495439529419
8 0.25 2.032536029815674
9 0.46875 1.564174771308899
10 0.4375 1.6370295286178589
11 0.4375 1.6729532480239868
test 0 0.1875 2.8775687217712402
test 1 0.09375 2.8043150901794434


Overall:   7%| | 27/400 [3:15:30<42:09:27, 406.88s/it, decoder_mask_ratio=0.75, epoch=26, lr=0.000299, 

{'train/loss': 4.101914897561073, 'train/recon_losses': nan, 'train/contrastive_losses': 4.101914897561073, 'train/num_steps': 864, 'train/cos_sim_encoder_output': 0.9881537891924381, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9933376554399729, 'train/probe_losses': 1.7680922945340474, 'train/probe_accs': 0.3932291666666667, 'test/probe_losses': 2.840941905975342, 'test/probe_accs': 0.140625, 'lr': 0.0002986741470951711, 'epoch': 26, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.09375 2.6426777839660645
1 0.15625 2.5325167179107666
2 0.21875 2.1805169582366943
3 0.0625 2.131619453430176
4 0.65625 1.6598116159439087
5 0.40625 1.7651097774505615
6 0.03125 2.355950355529785
7 0.09375 2.1802003383636475
8 0.15625 2.1867029666900635
9 0.1875 2.0007543563842773
10 0.1875 1.9575051069259644
11 0.0625 2.0059053897857666
test 0 0.0 2.6416499614715576
test 1 0.0 2.5565249919891357


Overall:   7%| | 28/400 [3:21:07<39:53:18, 386.02s/it, decoder_mask_ratio=0.75, epoch=27, lr=0.000299, 

{'train/loss': 4.1153551414608955, 'train/recon_losses': nan, 'train/contrastive_losses': 4.1153551414608955, 'train/num_steps': 896, 'train/cos_sim_encoder_output': 0.9890278149396181, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9942030776292086, 'train/probe_losses': 2.133272568384806, 'train/probe_accs': 0.19270833333333334, 'test/probe_losses': 2.5990874767303467, 'test/probe_accs': 0.0, 'lr': 0.00029850905387148683, 'epoch': 27, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.28125 1.8743189573287964
1 0.40625 1.9542820453643799
2 0.25 1.9610037803649902
3 0.15625 1.9235312938690186
4 0.46875 1.822409749031067
5 0.3125 1.9087032079696655
6 0.375 2.069638967514038
7 0.46875 2.039443254470825
8 0.34375 1.7357739210128784
9 0.46875 1.5680251121520996
10 0.671875 1.4628551006317139
11 0.4375 1.7065807580947876
test 0 0.0 2.6669692993164062
test 1 0.0 2.4588189125061035


Overall:   7%| | 29/400 [3:26:57<38:38:21, 374.94s/it, decoder_mask_ratio=0.75, epoch=28, lr=0.000298, 

{'train/loss': 3.9998236000537872, 'train/recon_losses': nan, 'train/contrastive_losses': 3.9998236000537872, 'train/num_steps': 928, 'train/cos_sim_encoder_output': 0.9850749485194683, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9928141795098782, 'train/probe_losses': 1.8355471789836884, 'train/probe_accs': 0.38671875, 'test/probe_losses': 2.562894105911255, 'test/probe_accs': 0.0, 'lr': 0.00029833432449745005, 'epoch': 28, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0625 2.0649704933166504
1 0.0625 2.072444438934326
2 0.34375 2.083153486251831
3 0.21875 1.7572017908096313
4 0.90625 1.2737139463424683
5 0.46875 1.7293376922607422
6 0.375 1.9408645629882812
7 0.15625 2.4218719005584717
8 0.21875 2.2845935821533203
9 0.1875 1.991658091545105
10 0.1875 2.1717631816864014
11 0.1875 1.9534566402435303
test 0 0.0 2.8315272331237793
test 1 0.0625 2.7323920726776123


Overall:   8%| | 30/400 [3:32:31<37:17:06, 362.77s/it, decoder_mask_ratio=0.75, epoch=29, lr=0.000298, 

{'train/loss': 3.963488072156906, 'train/recon_losses': nan, 'train/contrastive_losses': 3.963488072156906, 'train/num_steps': 960, 'train/cos_sim_encoder_output': 0.9823175389319658, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9934988431632519, 'train/probe_losses': 1.9787524839242299, 'train/probe_accs': 0.28125, 'test/probe_losses': 2.781959652900696, 'test/probe_accs': 0.03125, 'lr': 0.00029814997031099935, 'epoch': 29, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.2031967639923096
1 0.0 2.1433427333831787
2 0.28125 1.9866303205490112
3 0.59375 1.7701528072357178
4 0.5625 1.5650569200515747
5 0.40625 1.6957412958145142
6 0.0 2.1941628456115723
7 0.25 1.659282922744751
8 0.0625 1.9371140003204346
9 0.28125 1.5253512859344482
10 0.3125 1.4448052644729614
11 0.375 1.7536425590515137
test 0 0.59375 2.0177161693573
test 1 0.4375 2.004335880279541


Overall:   8%| | 31/400 [3:35:21<31:14:50, 304.85s/it, decoder_mask_ratio=0.75, epoch=30, lr=0.000298, 

{'train/loss': 4.005857229232788, 'train/recon_losses': nan, 'train/contrastive_losses': 4.005857229232788, 'train/num_steps': 992, 'train/cos_sim_encoder_output': 0.9847839884459972, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9952130652964115, 'train/probe_losses': 1.823206643263499, 'train/probe_accs': 0.2604166666666667, 'test/probe_losses': 2.0110260248184204, 'test/probe_accs': 0.515625, 'lr': 0.0002979560032746138, 'epoch': 30, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 1.9872798919677734
1 0.0 2.042022943496704
2 0.09375 1.9556057453155518
3 0.328125 1.8438975811004639
4 0.375 1.6239169836044312
5 0.375 1.5792899131774902
6 0.4375 1.8160221576690674
7 0.59375 1.3307533264160156
8 0.4375 1.638128638267517
9 0.21875 1.5759562253952026
10 0.265625 1.2999985218048096
11 0.5625 1.310678243637085
test 0 0.125 2.134401321411133
test 1 0.03125 1.752031683921814


Overall:   8%| | 32/400 [3:39:56<30:14:41, 295.87s/it, decoder_mask_ratio=0.75, epoch=31, lr=0.000298, 

{'train/loss': 4.054017245769501, 'train/recon_losses': nan, 'train/contrastive_losses': 4.054017245769501, 'train/num_steps': 1024, 'train/cos_sim_encoder_output': 0.9843098241835833, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9955687560141087, 'train/probe_losses': 1.6669625143210094, 'train/probe_accs': 0.3072916666666667, 'test/probe_losses': 1.9432165026664734, 'test/probe_accs': 0.078125, 'lr': 0.00029775243597453634, 'epoch': 31, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.4052295684814453
1 0.21875 2.209399938583374
2 0.15625 1.9542988538742065
3 0.28125 1.8582727909088135
4 0.4375 1.571249008178711
5 0.3125 1.8049232959747314
6 0.34375 1.8028690814971924
7 0.3125 1.955737829208374
8 0.28125 2.041921854019165
9 0.21875 2.166529655456543
10 0.03125 2.718982696533203
11 0.125 2.6639273166656494
test 0 0.1875 2.069110870361328
test 1 0.125 2.1798980236053467


Overall:   8%| | 33/400 [3:43:52<28:20:15, 277.97s/it, decoder_mask_ratio=0.75, epoch=32, lr=0.000298, 

{'train/loss': 3.8888884112238884, 'train/recon_losses': nan, 'train/contrastive_losses': 3.8888884112238884, 'train/num_steps': 1056, 'train/cos_sim_encoder_output': 0.9733269810676575, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9955264404416084, 'train/probe_losses': 2.0961118241151175, 'train/probe_accs': 0.2265625, 'test/probe_losses': 2.1245044469833374, 'test/probe_accs': 0.15625, 'lr': 0.0002975392816199574, 'epoch': 32, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.59375 1.6841390132904053
1 0.25 1.8850516080856323
2 0.4375 1.7345184087753296
3 0.28125 1.6321005821228027
4 0.25 1.609412431716919
5 0.3125 1.5497103929519653
6 0.125 1.903475046157837
7 0.03125 1.69291353225708
8 0.4375 1.9196619987487793
9 0.25 2.098379135131836
10 0.15625 1.825303554534912
11 0.15625 2.0371994972229004
test 0 0.25 1.7012577056884766
test 1 0.34375 1.6112935543060303


Overall:   8%| | 34/400 [3:48:41<28:35:28, 281.23s/it, decoder_mask_ratio=0.75, epoch=33, lr=0.000297, 

{'train/loss': 3.963218852877617, 'train/recon_losses': nan, 'train/contrastive_losses': 3.963218852877617, 'train/num_steps': 1088, 'train/cos_sim_encoder_output': 0.9741984736174345, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9948314316570759, 'train/probe_losses': 1.7976554334163666, 'train/probe_accs': 0.2734375, 'test/probe_losses': 1.6562756299972534, 'test/probe_accs': 0.296875, 'lr': 0.0002973165540421576, 'epoch': 33, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.09375 2.0129570960998535
1 0.15625 1.9439542293548584
2 0.421875 1.903178095817566
3 0.21875 1.9308699369430542
4 0.21875 1.9297455549240112
5 0.15625 1.9270068407058716
6 0.4375 1.7016096115112305
7 0.3125 2.0911436080932617
8 0.5 1.6267521381378174
9 0.28125 1.9884065389633179
10 0.53125 1.549834966659546
11 0.578125 1.5932141542434692
test 0 0.28125 1.975545048713684
test 1 0.203125 2.2904069423675537


Overall:   9%| | 35/400 [3:52:26<26:49:44, 264.61s/it, decoder_mask_ratio=0.75, epoch=34, lr=0.000297, 

{'train/loss': 3.9942132085561752, 'train/recon_losses': nan, 'train/contrastive_losses': 3.9942132085561752, 'train/num_steps': 1120, 'train/cos_sim_encoder_output': 0.9810165148228407, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9945923946797848, 'train/probe_losses': 1.8498893976211548, 'train/probe_accs': 0.3255208333333333, 'test/probe_losses': 2.132975995540619, 'test/probe_accs': 0.2421875, 'lr': 0.0002970842676936102, 'epoch': 34, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.030752658843994
1 0.0 2.137458086013794
2 0.21875 2.028057813644409
3 0.28125 1.7841085195541382
4 0.375 1.6662341356277466
5 0.28125 1.624739170074463
6 0.28125 1.6881352663040161
7 0.375 1.7294703722000122
8 0.3125 2.005345582962036
9 0.125 2.8627078533172607
10 0.0625 3.127068519592285
11 0.125 2.6940510272979736
test 0 0.0 3.619894027709961
test 1 0.0 3.498558759689331


Overall:   9%| | 36/400 [3:57:54<28:40:04, 283.53s/it, decoder_mask_ratio=0.75, epoch=35, lr=0.000297, 

{'train/loss': 3.9765480235219, 'train/recon_losses': nan, 'train/contrastive_losses': 3.9765480235219, 'train/num_steps': 1152, 'train/cos_sim_encoder_output': 0.9878233801573515, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9960308764129877, 'train/probe_losses': 2.1148440837860107, 'train/probe_accs': 0.203125, 'test/probe_losses': 3.559226393699646, 'test/probe_accs': 0.0, 'lr': 0.00029684243764704357, 'epoch': 35, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 1.9633488655090332
1 0.03125 1.9777692556381226
2 0.375 1.8476052284240723
3 0.5625 1.5277034044265747
4 0.3125 2.0902161598205566
5 0.21875 2.258668899536133
6 0.15625 1.9331653118133545
7 0.15625 1.674668788909912
8 0.0625 1.7886887788772583
9 0.46875 1.8994075059890747
10 0.53125 1.794785976409912
11 0.5 1.9114593267440796
test 0 0.1875 1.7517461776733398
test 1 0.15625 1.5030277967453003


Overall:   9%| | 37/400 [4:04:54<32:42:30, 324.38s/it, decoder_mask_ratio=0.75, epoch=36, lr=0.000297, 

{'train/loss': 4.0077056139707565, 'train/recon_losses': nan, 'train/contrastive_losses': 4.0077056139707565, 'train/num_steps': 1184, 'train/cos_sim_encoder_output': 0.987518422305584, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9969011172652245, 'train/probe_losses': 1.888957291841507, 'train/probe_accs': 0.28125, 'test/probe_losses': 1.62738698720932, 'test/probe_accs': 0.171875, 'lr': 0.00029659107959446277, 'epoch': 36, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.125 2.1887238025665283
1 0.375 2.0402257442474365
2 0.15625 1.9929641485214233
3 0.65625 1.6088175773620605
4 0.75 1.2090704441070557
5 0.53125 1.7103937864303589
6 0.40625 1.9266185760498047
7 0.375 1.7937812805175781
8 0.375 1.6233205795288086
9 0.21875 1.805100440979004
10 0.40625 1.2639787197113037
11 0.375 1.4928966760635376
test 0 0.46875 2.0123770236968994
test 1 0.25 2.0478105545043945


Overall:  10%| | 38/400 [4:08:49<29:55:16, 297.56s/it, decoder_mask_ratio=0.75, epoch=37, lr=0.000296, 

{'train/loss': 3.8641301169991493, 'train/recon_losses': nan, 'train/contrastive_losses': 3.8641301169991493, 'train/num_steps': 1216, 'train/cos_sim_encoder_output': 0.9761777650564909, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9967078007757664, 'train/probe_losses': 1.7213243146737416, 'train/probe_accs': 0.3958333333333333, 'test/probe_losses': 2.030093789100647, 'test/probe_accs': 0.359375, 'lr': 0.0002963302098461317, 'epoch': 37, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.1875 2.1655564308166504
1 0.25 2.082017421722412
2 0.15625 2.1352131366729736
3 0.15625 2.0151712894439697
4 0.09375 1.9372950792312622
5 0.46875 1.8099212646484375
6 0.25 1.652871012687683
7 0.40625 1.476584792137146
8 0.78125 1.4990602731704712
9 0.8125 1.4642107486724854
10 0.53125 1.9240880012512207
11 0.53125 1.8910894393920898
test 0 0.5 1.9115763902664185
test 1 0.375 2.114649772644043


Overall:  10%| | 39/400 [4:13:12<28:48:35, 287.30s/it, decoder_mask_ratio=0.75, epoch=38, lr=0.000296, 

{'train/loss': 3.9126035645604134, 'train/recon_losses': nan, 'train/contrastive_losses': 3.9126035645604134, 'train/num_steps': 1248, 'train/cos_sim_encoder_output': 0.9799397382885218, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9954566471278667, 'train/probe_losses': 1.8377565741539001, 'train/probe_accs': 0.3854166666666667, 'test/probe_losses': 2.0131130814552307, 'test/probe_accs': 0.4375, 'lr': 0.0002960598453295145, 'epoch': 38, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.28125 2.2015466690063477
1 0.34375 2.0329036712646484
2 0.3125 1.9514272212982178
3 0.21875 1.8800475597381592
4 0.25 1.683200478553772
5 0.5 1.792374610900879
6 0.53125 1.5698869228363037
7 0.6875 1.3646222352981567
8 0.5625 1.5951958894729614
9 0.53125 1.6989333629608154
10 0.375 2.2251200675964355
11 0.21875 2.638324499130249
test 0 0.28125 2.337693214416504
test 1 0.125 2.5804600715637207


Overall:  10%| | 40/400 [4:15:50<24:51:30, 248.59s/it, decoder_mask_ratio=0.75, epoch=39, lr=0.000296, 

{'train/loss': 3.963144414126873, 'train/recon_losses': nan, 'train/contrastive_losses': 3.963144414126873, 'train/num_steps': 1280, 'train/cos_sim_encoder_output': 0.9821643009781837, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9948783572763205, 'train/probe_losses': 1.8861319323380787, 'train/probe_accs': 0.4010416666666667, 'test/probe_losses': 2.4590766429901123, 'test/probe_accs': 0.203125, 'lr': 0.0002957800035881772, 'epoch': 39, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0625 2.3411285877227783
1 0.28125 2.13741397857666
2 0.21875 1.961128830909729
3 0.5 1.5563602447509766
4 0.53125 1.4151557683944702
5 0.28125 2.050478219985962
6 0.3125 1.811590313911438
7 0.25 2.025437116622925
8 0.4375 1.7513554096221924
9 0.3125 1.909407615661621
10 0.0625 2.334704637527466
11 0.1875 2.1970486640930176
test 0 0.21875 1.542566180229187


Overall:  10%| | 41/400 [4:18:28<22:03:43, 221.24s/it, decoder_mask_ratio=0.75, epoch=40, lr=0.000295, 

test 1 0.1875 1.7050371170043945
{'train/loss': 3.798316851258278, 'train/recon_losses': nan, 'train/contrastive_losses': 3.798316851258278, 'train/num_steps': 1312, 'train/cos_sim_encoder_output': 0.9828016795217991, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9927291367202997, 'train/probe_losses': 1.9576007823149364, 'train/probe_accs': 0.2864583333333333, 'test/probe_losses': 1.6238016486167908, 'test/probe_accs': 0.203125, 'lr': 0.00029549070278064935, 'epoch': 40, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.1875 2.067603588104248
1 0.15625 2.0455057621002197
2 0.125 2.0265212059020996
3 0.46875 1.80195951461792
4 0.8125 1.4275665283203125
5 0.625 1.7102595567703247
6 0.75 1.392694354057312
7 0.5 1.9014315605163574
8 0.53125 1.843510627746582
9 0.53125 1.753877878189087
10 0.4375 1.9124701023101807
11 0.65625 1.4818233251571655
test 0 0.21875 2.170799493789673
test 1 0.28125 2.0602259635925293


Overall:  10%| | 42/400 [4:23:16<24:00:47, 241.47s/it, decoder_mask_ratio=0.75, epoch=41, lr=0.000295, 

{'train/loss': 3.710613250732422, 'train/recon_losses': nan, 'train/contrastive_losses': 3.710613250732422, 'train/num_steps': 1344, 'train/cos_sim_encoder_output': 0.9782536551356316, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9919765274971724, 'train/probe_losses': 1.7804353336493175, 'train/probe_accs': 0.4817708333333333, 'test/probe_losses': 2.115512728691101, 'test/probe_accs': 0.25, 'lr': 0.00029519196167924577, 'epoch': 41, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.4375 2.0568840503692627
1 0.40625 2.023663282394409
2 0.28125 2.0395865440368652
3 0.53125 1.583251953125
4 0.28125 2.019254207611084
5 0.4375 1.6841915845870972
6 0.1875 1.8623626232147217
7 0.15625 1.8258788585662842
8 0.03125 2.237745761871338
9 0.375 1.799804925918579
10 0.4375 1.9761008024215698
11 0.21875 2.3730177879333496
test 0 0.4375 2.267334461212158
test 1 0.4375 1.9958797693252563


Overall:  11%| | 43/400 [4:28:49<26:38:48, 268.71s/it, decoder_mask_ratio=0.75, epoch=42, lr=0.000295, 

{'train/loss': 3.7710741385817528, 'train/recon_losses': nan, 'train/contrastive_losses': 3.7710741385817528, 'train/num_steps': 1376, 'train/cos_sim_encoder_output': 0.9802993405610323, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.992780327796936, 'train/probe_losses': 1.9568118651707966, 'train/probe_accs': 0.3151041666666667, 'test/probe_losses': 2.1316071152687073, 'test/probe_accs': 0.4375, 'lr': 0.00029488379966884835, 'epoch': 42, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.3125 1.9695216417312622
1 0.1875 1.9901371002197266
2 0.0625 2.0286266803741455
3 0.5625 1.5769531726837158
4 0.34375 1.4972964525222778
5 0.125 2.2204675674438477
6 0.34375 2.0084948539733887
7 0.1875 2.506084442138672
8 0.375 2.124586820602417
9 0.21875 2.294398307800293
10 0.15625 2.4504551887512207
11 0.4375 1.9910796880722046
test 0 0.515625 1.8334603309631348
test 1 0.46875 1.948148488998413


Overall:  11%| | 44/400 [4:31:48<23:55:08, 241.88s/it, decoder_mask_ratio=0.75, epoch=43, lr=0.000295, 

{'train/loss': 3.753168970346451, 'train/recon_losses': nan, 'train/contrastive_losses': 3.753168970346451, 'train/num_steps': 1408, 'train/cos_sim_encoder_output': 0.9796550832688808, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9940650928765535, 'train/probe_losses': 2.054841826359431, 'train/probe_accs': 0.2760416666666667, 'test/probe_losses': 1.890804409980774, 'test/probe_accs': 0.4921875, 'lr': 0.00029456623674564843, 'epoch': 43, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.3125 2.068000316619873
1 0.1875 2.180427074432373
2 0.15625 2.097400426864624
3 0.21875 2.0221424102783203
4 0.375 1.840977668762207
5 0.25 1.6948217153549194
6 0.28125 1.667548418045044
7 0.65625 1.8208343982696533
8 0.5 1.9909512996673584
9 0.5 1.9166929721832275
10 0.25 2.0246293544769287
11 0.4375 1.905626893043518
test 0 0.125 2.2135090827941895
test 1 0.09375 2.276860475540161


Overall:  11%| | 45/400 [4:36:52<25:40:52, 260.43s/it, decoder_mask_ratio=0.75, epoch=44, lr=0.000294, 

{'train/loss': 3.7291653156280518, 'train/recon_losses': nan, 'train/contrastive_losses': 3.7291653156280518, 'train/num_steps': 1440, 'train/cos_sim_encoder_output': 0.9764735270291567, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9937242902815342, 'train/probe_losses': 1.935837745666504, 'train/probe_accs': 0.34375, 'test/probe_losses': 2.2451847791671753, 'test/probe_accs': 0.109375, 'lr': 0.000294239293515849, 'epoch': 44, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.262639045715332
1 0.0 2.289445161819458
2 0.03125 2.131866693496704
3 0.3125 1.87971031665802
4 0.21875 1.973813772201538
5 0.28125 1.9672136306762695
6 0.28125 1.9309041500091553
7 0.15625 2.3704123497009277
8 0.34375 2.3888731002807617
9 0.3125 2.27095627784729
10 0.3125 2.156158447265625
11 0.28125 2.0036892890930176
test 0 0.71875 1.8025404214859009
test 1 0.59375 2.1388840675354004


Overall:  12%| | 46/400 [4:41:40<26:26:17, 268.86s/it, decoder_mask_ratio=0.75, epoch=45, lr=0.000294, 

{'train/loss': 3.7324783131480217, 'train/recon_losses': nan, 'train/contrastive_losses': 3.7324783131480217, 'train/num_steps': 1472, 'train/cos_sim_encoder_output': 0.9776104874908924, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9933957364410162, 'train/probe_losses': 2.135473519563675, 'train/probe_accs': 0.2109375, 'test/probe_losses': 1.9707122445106506, 'test/probe_accs': 0.65625, 'lr': 0.00029390299119432764, 'epoch': 45, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.167266607284546
1 0.0 2.1283113956451416
2 0.25 1.9744600057601929
3 0.28125 1.775219202041626
4 0.625 1.5657750368118286
5 0.46875 1.446662425994873
6 0.46875 1.6631748676300049
7 0.5625 1.6939749717712402
8 0.53125 1.6857399940490723
9 0.34375 1.8291634321212769
10 0.3125 1.8275481462478638
11 0.4375 1.4627727270126343
test 0 0.46875 1.6640005111694336
test 1 0.3125 1.970247745513916


Overall:  12%| | 47/400 [4:47:51<29:22:12, 299.52s/it, decoder_mask_ratio=0.75, epoch=46, lr=0.000294, 

{'train/loss': 3.710535801947117, 'train/recon_losses': nan, 'train/contrastive_losses': 3.710535801947117, 'train/num_steps': 1504, 'train/cos_sim_encoder_output': 0.9766072481870651, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9946854449808598, 'train/probe_losses': 1.768339067697525, 'train/probe_accs': 0.3567708333333333, 'test/probe_losses': 1.8171241283416748, 'test/probe_accs': 0.390625, 'lr': 0.00029355735160326013, 'epoch': 46, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.28125 2.384782075881958
1 0.4375 2.1356632709503174
2 0.34375 2.0394375324249268
3 0.6875 1.6726198196411133
4 0.5625 1.65911066532135
5 0.40625 1.8690178394317627
6 0.34375 2.037233829498291
7 0.375 1.8581225872039795
8 0.40625 1.8230631351470947
9 0.5625 1.4143917560577393
10 0.375 1.5974255800247192
11 0.28125 2.1219048500061035
test 0 0.15625 2.1883342266082764
test 1 0.21875 2.0801968574523926


Overall:  12%| | 48/400 [4:50:55<25:52:39, 264.66s/it, decoder_mask_ratio=0.75, epoch=47, lr=0.000293, 

{'train/loss': 3.6309469640254974, 'train/recon_losses': nan, 'train/contrastive_losses': 3.6309469640254974, 'train/num_steps': 1536, 'train/cos_sim_encoder_output': 0.9687620792537928, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9938028771430254, 'train/probe_losses': 1.8843977451324463, 'train/probe_accs': 0.421875, 'test/probe_losses': 2.1342655420303345, 'test/probe_accs': 0.1875, 'lr': 0.0002932023971707042, 'epoch': 47, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 1.974382996559143
1 0.03125 2.1078884601593018
2 0.3125 2.0505807399749756
3 0.1875 1.8664568662643433
4 0.3125 1.9437698125839233
5 0.21875 2.0491950511932373
6 0.34375 1.989284873008728
7 0.40625 1.9339996576309204
8 0.1875 2.1807451248168945
9 0.34375 1.6633139848709106
10 0.28125 1.7707018852233887
11 0.21875 1.7592382431030273
test 0 0.4375 1.8763071298599243
test 1 0.53125 1.630874514579773


Overall:  12%| | 49/400 [4:54:07<23:41:06, 242.92s/it, decoder_mask_ratio=0.75, epoch=48, lr=0.000293, 

{'train/loss': 3.6596779227256775, 'train/recon_losses': nan, 'train/contrastive_losses': 3.6596779227256775, 'train/num_steps': 1568, 'train/cos_sim_encoder_output': 0.965597715228796, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9916046392172575, 'train/probe_losses': 1.940796474615733, 'train/probe_accs': 0.23697916666666666, 'test/probe_losses': 1.7535908222198486, 'test/probe_accs': 0.484375, 'lr': 0.00029283815092914425, 'epoch': 48, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.240156888961792
1 0.0 2.206007957458496
2 0.21875 2.362696409225464
3 0.09375 2.2254881858825684
4 0.34375 1.9177051782608032
5 0.125 1.8339087963104248
6 0.375 1.7210592031478882
7 0.21875 2.068464517593384
8 0.21875 2.022923469543457
9 0.15625 2.0270259380340576
10 0.09375 1.9221022129058838
11 0.0 2.1295950412750244
test 0 0.0625 2.410735845565796
test 1 0.046875 2.2344632148742676


Overall:  12%|▏| 50/400 [4:56:48<21:14:14, 218.44s/it, decoder_mask_ratio=0.75, epoch=49, lr=0.000292, 

{'train/loss': 3.7717237919569016, 'train/recon_losses': nan, 'train/contrastive_losses': 3.7717237919569016, 'train/num_steps': 1600, 'train/cos_sim_encoder_output': 0.9759638775140047, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9910781048238277, 'train/probe_losses': 2.056427816549937, 'train/probe_accs': 0.15364583333333334, 'test/probe_losses': 2.3225995302200317, 'test/probe_accs': 0.0546875, 'lr': 0.000292464636513997, 'epoch': 49, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0625 2.0628910064697266
1 0.125 1.9230173826217651
2 0.1875 1.8830089569091797
3 0.75 1.7098474502563477
4 0.375 1.8681542873382568
5 0.40625 1.7295936346054077
6 0.625 1.6584389209747314
7 0.59375 1.7505285739898682
8 0.625 1.690253496170044
9 0.4375 1.8519089221954346
10 0.84375 1.3814167976379395
11 0.78125 1.3390963077545166
test 0 0.25 1.8400678634643555
test 1 0.4375 1.5762864351272583


Overall:  13%|▏| 51/400 [5:00:33<21:22:10, 220.43s/it, decoder_mask_ratio=0.75, epoch=50, lr=0.000292, 

{'train/loss': 3.715432234108448, 'train/recon_losses': nan, 'train/contrastive_losses': 3.715432234108448, 'train/num_steps': 1632, 'train/cos_sim_encoder_output': 0.9673093445599079, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.995345251634717, 'train/probe_losses': 1.737346311410268, 'train/probe_accs': 0.484375, 'test/probe_losses': 1.7081771492958069, 'test/probe_accs': 0.34375, 'lr': 0.0002920818781620775, 'epoch': 50, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.28125 2.2228598594665527
1 0.28125 2.1966233253479004
2 0.09375 2.168717622756958
3 0.03125 2.063716411590576
4 0.0625 2.0088613033294678
5 0.25 1.8233555555343628
6 0.3125 1.6622202396392822
7 0.125 1.731413722038269
8 0.28125 1.7007755041122437
9 0.21875 1.6623344421386719
10 0.09375 1.6120312213897705
11 0.25 1.4745267629623413
test 0 0.53125 2.003051519393921
test 1 0.40625 1.9993388652801514


Overall:  13%|▏| 52/400 [5:05:05<22:47:33, 235.79s/it, decoder_mask_ratio=0.75, epoch=51, lr=0.000292, 

{'train/loss': 3.6476460099220276, 'train/recon_losses': nan, 'train/contrastive_losses': 3.6476460099220276, 'train/num_steps': 1664, 'train/cos_sim_encoder_output': 0.9649613592773676, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9936721418052912, 'train/probe_losses': 1.8606196641921997, 'train/probe_accs': 0.19010416666666666, 'test/probe_losses': 2.001195192337036, 'test/probe_accs': 0.46875, 'lr': 0.00029168990071002667, 'epoch': 51, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.09375 1.9241313934326172
1 0.71875 1.844793438911438
2 0.25 1.9464572668075562
3 0.34375 2.1627798080444336
4 0.25 2.1284966468811035
5 0.3125 1.8204292058944702
6 0.25 1.7894715070724487
7 0.3125 1.722650408744812
8 0.25 1.7313307523727417
9 0.4375 1.66510009765625
10 0.625 1.6683510541915894
11 0.46875 1.7733269929885864
test 0 0.59375 2.15392804145813
test 1 0.84375 1.7636057138442993


Overall:  13%|▏| 53/400 [5:08:24<21:40:21, 224.84s/it, decoder_mask_ratio=0.75, epoch=52, lr=0.000291, 

{'train/loss': 3.6689338311553, 'train/recon_losses': nan, 'train/contrastive_losses': 3.6689338311553, 'train/num_steps': 1696, 'train/cos_sim_encoder_output': 0.965994767844677, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9939139038324356, 'train/probe_losses': 1.8481098810831706, 'train/probe_accs': 0.359375, 'test/probe_losses': 1.9587668776512146, 'test/probe_accs': 0.71875, 'lr': 0.00029128872959269985, 'epoch': 52, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.419926643371582
1 0.0 2.404392719268799
2 0.0 2.280411720275879
3 0.21875 2.0487332344055176
4 0.15625 1.7805346250534058
5 0.796875 1.5437629222869873
6 0.75 1.4755531549453735
7 0.46875 1.8121140003204346
8 0.40625 1.9484221935272217
9 0.40625 1.917089819908142
10 0.25 2.1538922786712646
11 0.46875 1.7538641691207886
test 0 0.28125 1.8444066047668457
test 1 0.5625 1.3803508281707764


Overall:  14%|▏| 54/400 [5:10:57<19:32:18, 203.29s/it, decoder_mask_ratio=0.75, epoch=53, lr=0.000291, 

{'train/loss': 3.7223174571990967, 'train/recon_losses': nan, 'train/contrastive_losses': 3.7223174571990967, 'train/num_steps': 1728, 'train/cos_sim_encoder_output': 0.9728823527693748, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9956540632992983, 'train/probe_losses': 1.9615581234296162, 'train/probe_accs': 0.3268229166666667, 'test/probe_losses': 1.612378716468811, 'test/probe_accs': 0.421875, 'lr': 0.00029087839084151573, 'epoch': 53, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.015625 2.2481582164764404
1 0.0625 2.2012085914611816
2 0.15625 2.1250243186950684
3 0.53125 1.7836418151855469
4 0.3125 1.7785677909851074
5 0.5 1.9321151971817017
6 0.40625 1.9099873304367065
7 0.53125 1.5317655801773071
8 0.4375 1.7063730955123901
9 0.4375 1.654099464416504
10 0.5 1.6703397035598755
11 0.421875 2.0848920345306396
test 0 0.40625 1.5606038570404053
test 1 0.53125 1.4852259159088135


Overall:  14%|▏| 55/400 [5:16:21<22:56:44, 239.43s/it, decoder_mask_ratio=0.75, epoch=54, lr=0.00029, t

{'train/loss': 3.533989369869232, 'train/recon_losses': nan, 'train/contrastive_losses': 3.533989369869232, 'train/num_steps': 1760, 'train/cos_sim_encoder_output': 0.9619597159326077, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9944035448133945, 'train/probe_losses': 1.8855144282182057, 'train/probe_accs': 0.359375, 'test/probe_losses': 1.5229148864746094, 'test/probe_accs': 0.46875, 'lr': 0.0002904589110827679, 'epoch': 54, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.09375 2.0230374336242676
1 0.25 1.939662218093872
2 0.6875 1.7681306600570679
3 0.546875 1.4946438074111938
4 0.625 1.2019932270050049
5 0.34375 1.8647481203079224
6 0.3125 2.1809518337249756
7 0.25 1.8527419567108154
8 0.1875 1.7935429811477661
9 0.4375 1.809309482574463
10 0.25 2.08950138092041
11 0.3125 2.051557779312134
test 0 0.3125 2.3059797286987305
test 1 0.15625 2.6960113048553467


Overall:  14%|▏| 56/400 [5:18:59<20:32:50, 215.03s/it, decoder_mask_ratio=0.75, epoch=55, lr=0.00029, t

{'train/loss': 3.6555802896618843, 'train/recon_losses': nan, 'train/contrastive_losses': 3.6555802896618843, 'train/num_steps': 1792, 'train/cos_sim_encoder_output': 0.9564002938568592, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9940541386604309, 'train/probe_losses': 1.8391517400741577, 'train/probe_accs': 0.3580729166666667, 'test/probe_losses': 2.5009955167770386, 'test/probe_accs': 0.234375, 'lr': 0.00029003031753589676, 'epoch': 55, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 1.9977197647094727
1 0.0 2.033787250518799
2 0.0 2.062756061553955
3 0.75 1.6463161706924438
4 0.34375 2.00020170211792
5 0.75 1.278373122215271
6 0.5625 1.3984447717666626
7 0.4375 1.531488060951233
8 0.53125 1.6056634187698364
9 0.40625 1.6409432888031006
10 0.25 1.8357656002044678
11 0.1875 1.7116453647613525
test 0 0.46875 1.950638771057129
test 1 0.375 2.23587965965271


Overall:  14%|▏| 57/400 [5:21:39<18:54:26, 198.44s/it, decoder_mask_ratio=0.75, epoch=56, lr=0.00029, t

{'train/loss': 3.523111030459404, 'train/recon_losses': nan, 'train/contrastive_losses': 3.523111030459404, 'train/num_steps': 1824, 'train/cos_sim_encoder_output': 0.9543511290103197, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9945834893733263, 'train/probe_losses': 1.7285920480887096, 'train/probe_accs': 0.3515625, 'test/probe_losses': 2.0932592153549194, 'test/probe_accs': 0.421875, 'lr': 0.0002895926380117234, 'epoch': 56, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.1689507961273193
1 0.0 2.2413012981414795
2 0.28125 2.12380313873291
3 0.3125 1.835876226425171
4 0.40625 1.544875979423523
5 0.34375 1.5041694641113281
6 0.28125 1.5112345218658447
7 0.625 1.481249213218689
8 0.5625 1.7643855810165405
9 0.6875 1.4796909093856812
10 0.375 1.986361026763916
11 0.28125 1.9324116706848145
test 0 0.03125 2.34125018119812
test 1 0.09375 2.6377854347229004


Overall:  14%|▏| 58/400 [5:25:22<19:34:11, 206.00s/it, decoder_mask_ratio=0.75, epoch=57, lr=0.000289, 

{'train/loss': 3.5527141019701958, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5527141019701958, 'train/num_steps': 1856, 'train/cos_sim_encoder_output': 0.9658660050481558, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9914464391767979, 'train/probe_losses': 1.7978591521581013, 'train/probe_accs': 0.3463541666666667, 'test/probe_losses': 2.4895178079605103, 'test/probe_accs': 0.0625, 'lr': 0.0002891459009106449, 'epoch': 57, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.25 1.9240148067474365
1 0.09375 1.9322552680969238
2 0.53125 1.8396300077438354
3 0.53125 1.7176315784454346
4 0.3125 1.9960436820983887
5 0.125 2.1873979568481445
6 0.21875 1.9198607206344604
7 0.46875 1.6002845764160156
8 0.59375 1.423296570777893
9 1.0 1.2990158796310425
10 0.90625 1.3631892204284668
11 0.9375 1.3542070388793945
test 0 0.5 1.7249994277954102
test 1 0.34375 1.8538471460342407


Overall:  15%|▏| 59/400 [5:29:36<20:51:47, 220.26s/it, decoder_mask_ratio=0.75, epoch=58, lr=0.000289, 

{'train/loss': 3.501707285642624, 'train/recon_losses': nan, 'train/contrastive_losses': 3.501707285642624, 'train/num_steps': 1888, 'train/cos_sim_encoder_output': 0.9641715064644814, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9937481936067343, 'train/probe_losses': 1.7130689422289531, 'train/probe_accs': 0.4973958333333333, 'test/probe_losses': 1.7894232869148254, 'test/probe_accs': 0.421875, 'lr': 0.0002886901352207915, 'epoch': 58, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.118579387664795
1 0.0 2.0987937450408936
2 0.5 1.962422490119934
3 0.46875 1.6725285053253174
4 0.3125 1.6382510662078857
5 0.25 1.6739577054977417
6 0.15625 2.096259355545044
7 0.1875 1.9468967914581299
8 0.171875 2.4134280681610107
9 0.3125 2.0198066234588623
10 0.40625 2.0038232803344727
11 0.328125 1.9514340162277222
test 0 0.3125 2.011869430541992
test 1 0.109375 2.240034580230713


Overall:  15%|▏| 60/400 [5:32:24<19:18:46, 204.49s/it, decoder_mask_ratio=0.75, epoch=59, lr=0.000288, 

{'train/loss': 3.662358559668064, 'train/recon_losses': nan, 'train/contrastive_losses': 3.662358559668064, 'train/num_steps': 1920, 'train/cos_sim_encoder_output': 0.9750576782971621, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9934833087027073, 'train/probe_losses': 1.9663484195868175, 'train/probe_accs': 0.2578125, 'test/probe_losses': 2.1259520053863525, 'test/probe_accs': 0.2109375, 'lr': 0.00028822537051614584, 'epoch': 59, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.0994675159454346
1 0.0 2.113574981689453
2 0.15625 2.057162284851074
3 0.25 1.9856356382369995
4 0.28125 1.7669386863708496
5 0.4375 1.5818798542022705
6 0.53125 1.345472812652588
7 0.4375 1.3085554838180542
8 0.75 1.2063701152801514
9 0.59375 1.3948376178741455
10 0.875 1.0554959774017334
11 0.84375 1.2654907703399658
test 0 0.0 2.3595149517059326
test 1 0.0 2.3929224014282227


Overall:  15%|▏| 61/400 [5:35:33<18:50:25, 200.07s/it, decoder_mask_ratio=0.75, epoch=60, lr=0.000288, 

{'train/loss': 3.5305247828364372, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5305247828364372, 'train/num_steps': 1952, 'train/cos_sim_encoder_output': 0.9672293066978455, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9933706447482109, 'train/probe_losses': 1.5984068115552266, 'train/probe_accs': 0.4296875, 'test/probe_losses': 2.3762186765670776, 'test/probe_accs': 0.0, 'lr': 0.0002877516369546234, 'epoch': 60, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.1969428062438965
1 0.0 2.179913282394409
2 0.046875 2.2519712448120117
3 0.6875 1.8207461833953857
4 0.65625 1.5232332944869995
5 0.5 1.846854567527771
6 0.40625 2.0393073558807373
7 0.28125 2.211122512817383
8 0.625 1.4435744285583496
9 0.5 1.5557554960250854
10 0.375 1.7381523847579956
11 0.3125 1.8067394495010376
test 0 0.03125 2.7246274948120117
test 1 0.21875 2.3237853050231934


Overall:  16%|▏| 62/400 [5:39:38<20:01:45, 213.33s/it, decoder_mask_ratio=0.75, epoch=61, lr=0.000287, 

{'train/loss': 3.514986552298069, 'train/recon_losses': nan, 'train/contrastive_losses': 3.514986552298069, 'train/num_steps': 1984, 'train/cos_sim_encoder_output': 0.9638486076146364, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9947818387299776, 'train/probe_losses': 1.8845260838667552, 'train/probe_accs': 0.3658854166666667, 'test/probe_losses': 2.5242063999176025, 'test/probe_accs': 0.125, 'lr': 0.0002872689652761161, 'epoch': 61, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.21875 2.070552110671997
1 0.46875 1.8905038833618164
2 0.28125 1.8041578531265259
3 0.28125 1.4909570217132568
4 0.78125 1.2146220207214355
5 0.65625 1.4540432691574097
6 0.4375 2.046204090118408
7 0.34375 2.311835527420044
8 0.15625 2.629270315170288
9 0.375 2.0638527870178223
10 0.46875 1.7822835445404053
11 0.5625 1.5991051197052002
test 0 0.09375 3.467421054840088
test 1 0.375 2.4639885425567627


Overall:  16%|▏| 63/400 [5:43:25<20:22:14, 217.61s/it, decoder_mask_ratio=0.75, epoch=62, lr=0.000287, 

{'train/loss': 3.4379311576485634, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4379311576485634, 'train/num_steps': 2016, 'train/cos_sim_encoder_output': 0.9493158552795649, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9952424950897694, 'train/probe_losses': 1.863115628560384, 'train/probe_accs': 0.4192708333333333, 'test/probe_losses': 2.9657047986984253, 'test/probe_accs': 0.234375, 'lr': 0.0002867773868004976, 'epoch': 62, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.427901029586792
1 0.0 2.224862575531006
2 0.34375 1.9900163412094116
3 0.5 1.864728331565857
4 0.5 1.739970326423645
5 0.46875 1.843306064605713
6 0.25 2.1298110485076904
7 0.53125 1.5084352493286133
8 0.34375 1.8480256795883179
9 0.125 2.1975996494293213
10 0.15625 2.0151827335357666
11 0.03125 2.2753257751464844
test 0 0.0 2.5728466510772705
test 1 0.0 2.532289981842041


Overall:  16%|▏| 64/400 [5:45:59<18:32:06, 198.59s/it, decoder_mask_ratio=0.75, epoch=63, lr=0.000286, 

{'train/loss': 3.5879433155059814, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5879433155059814, 'train/num_steps': 2048, 'train/cos_sim_encoder_output': 0.9634365849196911, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9926919005811214, 'train/probe_losses': 2.0054304003715515, 'train/probe_accs': 0.2708333333333333, 'test/probe_losses': 2.5525683164596558, 'test/probe_accs': 0.0, 'lr': 0.00028627693342559064, 'epoch': 63, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.121047258377075
1 0.0 2.1415255069732666
2 0.125 2.0595757961273193
3 0.65625 1.66765558719635
4 0.78125 1.3882091045379639
5 0.65625 1.564455509185791
6 0.3125 2.1976656913757324
7 0.125 2.3627257347106934
8 0.21875 2.057905673980713
9 0.28125 1.9310442209243774
10 0.15625 2.1248114109039307
11 0.1875 2.0801568031311035
test 0 0.09375 1.9494805335998535
test 1 0.0625 1.9793505668640137


Overall:  16%|▏| 65/400 [5:48:31<17:10:16, 184.53s/it, decoder_mask_ratio=0.75, epoch=64, lr=0.000286, 

{'train/loss': 3.48332529515028, 'train/recon_losses': nan, 'train/contrastive_losses': 3.48332529515028, 'train/num_steps': 2080, 'train/cos_sim_encoder_output': 0.9667038843035698, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9940089322626591, 'train/probe_losses': 1.9747315247853596, 'train/probe_accs': 0.2916666666666667, 'test/probe_losses': 1.9644155502319336, 'test/probe_accs': 0.078125, 'lr': 0.00028576763762509755, 'epoch': 64, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.75 1.8390246629714966
1 0.75 1.7972698211669922
2 0.59375 1.8438435792922974
3 0.53125 1.7286791801452637
4 0.3125 1.9908099174499512
5 0.46875 1.6540316343307495
6 0.65625 1.4023329019546509
7 0.28125 1.5377458333969116
8 0.5625 1.6730413436889648
9 0.421875 2.1508123874664307
10 0.359375 2.178445339202881
11 0.53125 1.8348536491394043
test 0 0.34375 1.4088221788406372
test 1 0.3125 1.5108193159103394


Overall:  16%|▏| 66/400 [5:51:03<16:11:58, 174.61s/it, decoder_mask_ratio=0.75, epoch=65, lr=0.000285, 

{'train/loss': 3.5050802677869797, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5050802677869797, 'train/num_steps': 2112, 'train/cos_sim_encoder_output': 0.9636393580585718, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9956846013665199, 'train/probe_losses': 1.8025741875171661, 'train/probe_accs': 0.5182291666666666, 'test/probe_losses': 1.4598207473754883, 'test/probe_accs': 0.328125, 'lr': 0.00028524953244649293, 'epoch': 65, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.1875 2.062575578689575
1 0.25 2.0878090858459473
2 0.15625 1.9427907466888428
3 0.4375 1.732448697090149
4 0.671875 1.5041828155517578
5 0.640625 1.310569405555725
6 0.4375 1.6770907640457153
7 0.34375 2.041975259780884
8 0.34375 1.8669079542160034
9 0.21875 2.3856537342071533
10 0.4375 1.8854210376739502
11 0.25 1.7510007619857788
test 0 0.0625 2.4563536643981934
test 1 0.15625 2.2727112770080566


Overall:  17%|▏| 67/400 [5:53:32<15:27:00, 167.03s/it, decoder_mask_ratio=0.75, epoch=66, lr=0.000285, 

{'train/loss': 3.370360516011715, 'train/recon_losses': nan, 'train/contrastive_losses': 3.370360516011715, 'train/num_steps': 2144, 'train/cos_sim_encoder_output': 0.9486802816390991, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9939690381288528, 'train/probe_losses': 1.8540354867776234, 'train/probe_accs': 0.3645833333333333, 'test/probe_losses': 2.364532470703125, 'test/probe_accs': 0.109375, 'lr': 0.00028472265150887945, 'epoch': 66, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.03125 2.0183498859405518
1 0.0 2.042738914489746
2 0.09375 2.0891902446746826
3 0.1875 1.8580704927444458
4 0.59375 1.7383828163146973
5 0.53125 1.5665394067764282
6 0.5625 1.5977739095687866
7 0.4375 1.4866925477981567
8 0.5 1.5354150533676147
9 0.40625 1.7704132795333862
10 0.21875 1.7330436706542969
11 0.15625 1.3363031148910522
test 0 0.15625 1.7519707679748535
test 1 0.5 1.7386351823806763


Overall:  17%|▏| 68/400 [5:56:07<15:03:44, 163.33s/it, decoder_mask_ratio=0.75, epoch=67, lr=0.000284, 

{'train/loss': 3.481097564101219, 'train/recon_losses': nan, 'train/contrastive_losses': 3.481097564101219, 'train/num_steps': 2176, 'train/cos_sim_encoder_output': 0.9667357932776213, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9935723841190338, 'train/probe_losses': 1.7310761113961537, 'train/probe_accs': 0.3098958333333333, 'test/probe_losses': 1.745302975177765, 'test/probe_accs': 0.328125, 'lr': 0.000284187029000806, 'epoch': 67, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.4245805740356445
1 0.0 2.3289332389831543
2 0.0 2.178845167160034
3 0.21875 2.0088868141174316
4 0.1875 1.9669467210769653
5 0.46875 2.0227861404418945
6 0.5625 1.8046374320983887
7 0.46875 1.8076448440551758
8 0.65625 1.7202140092849731
9 0.4375 2.0165557861328125
10 0.15625 2.4055230617523193
11 0.125 2.4614810943603516
test 0 0.1875 2.3884716033935547
test 1 0.40625 2.1291756629943848


Overall:  17%|▏| 69/400 [5:59:25<15:59:31, 173.93s/it, decoder_mask_ratio=0.75, epoch=68, lr=0.000284, 

{'train/loss': 3.4535893723368645, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4535893723368645, 'train/num_steps': 2208, 'train/cos_sim_encoder_output': 0.9572942201048136, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9944268502295017, 'train/probe_losses': 2.0955862402915955, 'train/probe_accs': 0.2734375, 'test/probe_losses': 2.2588236331939697, 'test/probe_accs': 0.296875, 'lr': 0.0002836426996780496, 'epoch': 68, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.1554648876190186
1 0.0 2.1991429328918457
2 0.09375 2.2357778549194336
3 0.3125 2.068112850189209
4 0.21875 1.859412431716919
5 0.28125 1.6453450918197632
6 0.265625 1.733610987663269
7 0.125 1.8099431991577148
8 0.296875 2.0367937088012695
9 0.40625 1.8526479005813599
10 0.21875 2.068690061569214
11 0.40625 1.7173182964324951
test 0 0.5 1.8215751647949219
test 1 0.46875 1.7442246675491333


Overall:  18%|▏| 70/400 [6:02:36<16:24:28, 178.99s/it, decoder_mask_ratio=0.75, epoch=69, lr=0.000283, 

{'train/loss': 3.4434228986501694, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4434228986501694, 'train/num_steps': 2240, 'train/cos_sim_encoder_output': 0.9633635822683573, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9916928596794605, 'train/probe_losses': 1.9485216836134593, 'train/probe_accs': 0.21875, 'test/probe_losses': 1.7828999161720276, 'test/probe_accs': 0.484375, 'lr': 0.00028308969886136, 'epoch': 69, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.0266520977020264
1 0.0 2.0657460689544678
2 0.359375 1.9868561029434204
3 0.3125 1.9552408456802368
4 0.125 2.0746099948883057
5 0.03125 1.9482285976409912
6 0.078125 1.8144361972808838
7 0.359375 1.6161305904388428
8 0.3125 1.6224416494369507
9 0.28125 1.7224884033203125
10 0.375 1.669658899307251
11 0.3125 1.746803641319275
test 0 0.0 2.2138688564300537
test 1 0.0 2.334442377090454


Overall:  18%|▏| 71/400 [6:08:41<21:27:22, 234.78s/it, decoder_mask_ratio=0.75, epoch=70, lr=0.000283, 

{'train/loss': 3.4920853674411774, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4920853674411774, 'train/num_steps': 2272, 'train/cos_sim_encoder_output': 0.9426030199974775, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.993561141192913, 'train/probe_losses': 1.8541077574094136, 'train/probe_accs': 0.21223958333333334, 'test/probe_losses': 2.274155616760254, 'test/probe_accs': 0.0, 'lr': 0.0002825280624341676, 'epoch': 70, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.2486042976379395
1 0.0 2.1801187992095947
2 0.0 2.169511556625366
3 0.21875 1.8490012884140015
4 0.59375 1.6266099214553833
5 0.5625 1.185262680053711
6 0.53125 1.2377314567565918
7 0.3125 1.9296730756759644
8 0.34375 2.083081007003784
9 0.1875 2.3498802185058594
10 0.125 2.561767578125
11 0.3125 2.262777090072632
test 0 0.21875 2.4669346809387207
test 1 0.125 2.661878824234009


Overall:  18%|▏| 72/400 [6:12:33<21:18:50, 233.93s/it, decoder_mask_ratio=0.75, epoch=71, lr=0.000282, 

{'train/loss': 3.677291192114353, 'train/recon_losses': nan, 'train/contrastive_losses': 3.677291192114353, 'train/num_steps': 2304, 'train/cos_sim_encoder_output': 0.9608419518917799, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9948207046836615, 'train/probe_losses': 1.973668247461319, 'train/probe_accs': 0.265625, 'test/probe_losses': 2.5644067525863647, 'test/probe_accs': 0.171875, 'lr': 0.00028195782684025533, 'epoch': 71, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.15625 2.1880388259887695
1 0.09375 2.1907992362976074
2 0.40625 1.9594709873199463
3 0.25 1.8413119316101074
4 0.1875 1.656342625617981
5 0.46875 1.7542660236358643
6 0.375 2.0899224281311035
7 0.375 1.8817696571350098
8 0.3125 1.8175369501113892
9 0.25 1.744739055633545
10 0.375 1.5100438594818115
11 0.21875 1.7167699337005615
test 0 0.3125 2.2078685760498047
test 1 0.53125 1.8094897270202637


Overall:  18%|▏| 73/400 [6:15:11<19:11:21, 211.26s/it, decoder_mask_ratio=0.75, epoch=72, lr=0.000281, 

{'train/loss': 3.3990655317902565, 'train/recon_losses': nan, 'train/contrastive_losses': 3.3990655317902565, 'train/num_steps': 2336, 'train/cos_sim_encoder_output': 0.9602761175483465, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9919344708323479, 'train/probe_losses': 1.8625842928886414, 'train/probe_accs': 0.2890625, 'test/probe_losses': 2.008679151535034, 'test/probe_accs': 0.421875, 'lr': 0.00028137902908139376, 'epoch': 72, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.03125 2.310249090194702
1 0.03125 2.2808725833892822
2 0.0625 2.1195151805877686
3 0.125 2.029428005218506
4 0.25 2.0669147968292236
5 0.1875 1.9427893161773682
6 0.25 1.7130508422851562
7 0.28125 1.9148712158203125
8 0.65625 1.5994207859039307
9 0.59375 1.9538389444351196
10 0.515625 2.1294543743133545
11 0.375 2.4149234294891357
test 0 0.65625 1.5829668045043945
test 1 0.546875 2.021575689315796


Overall:  18%|▏| 74/400 [6:18:05<18:05:40, 199.82s/it, decoder_mask_ratio=0.75, epoch=73, lr=0.000281, 

{'train/loss': 3.483862303197384, 'train/recon_losses': nan, 'train/contrastive_losses': 3.483862303197384, 'train/num_steps': 2368, 'train/cos_sim_encoder_output': 0.961068119853735, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9914119802415371, 'train/probe_losses': 2.0396107137203217, 'train/probe_accs': 0.2799479166666667, 'test/probe_losses': 1.8022712469100952, 'test/probe_accs': 0.6015625, 'lr': 0.00028079170671494, 'epoch': 73, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.5 1.8842579126358032
1 0.4375 1.9435213804244995
2 0.28125 1.979866623878479
3 0.15625 2.0231595039367676
4 0.25 1.9832994937896729
5 0.03125 1.96138334274292
6 0.28125 2.4087605476379395
7 0.15625 2.4545955657958984
8 0.296875 2.2179338932037354
9 0.09375 2.58335542678833
10 0.1875 2.291118860244751
11 0.21875 2.149362564086914
test 0 0.25 2.104281425476074
test 1 0.46875 1.8062301874160767


Overall:  19%|▏| 75/400 [6:20:56<17:15:48, 191.23s/it, decoder_mask_ratio=0.75, epoch=74, lr=0.00028, t

{'train/loss': 3.651124820113182, 'train/recon_losses': nan, 'train/contrastive_losses': 3.651124820113182, 'train/num_steps': 2400, 'train/cos_sim_encoder_output': 0.9659882131963968, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9957453776150942, 'train/probe_losses': 2.156717926263809, 'train/probe_accs': 0.24088541666666666, 'test/probe_losses': 1.9552558064460754, 'test/probe_accs': 0.359375, 'lr': 0.00028019589785140077, 'epoch': 74, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.2831876277923584
1 0.25 2.127288341522217
2 0.0 2.136812210083008
3 0.125 2.020203113555908
4 0.25 1.8928852081298828
5 0.03125 1.8977903127670288
6 0.34375 1.5593140125274658
7 0.671875 1.5516653060913086
8 0.625 1.5960979461669922
9 0.65625 1.505667805671692
10 0.46875 1.4627671241760254
11 0.25 1.8029142618179321
test 0 0.3125 2.1344475746154785
test 1 0.125 2.13460636138916


Overall:  19%|▏| 76/400 [6:23:37<16:24:21, 182.29s/it, decoder_mask_ratio=0.75, epoch=75, lr=0.00028, t

{'train/loss': 3.6477276608347893, 'train/recon_losses': nan, 'train/contrastive_losses': 3.6477276608347893, 'train/num_steps': 2432, 'train/cos_sim_encoder_output': 0.9694985505193472, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.993285646662116, 'train/probe_losses': 1.819716105858485, 'train/probe_accs': 0.3059895833333333, 'test/probe_losses': 2.1345269680023193, 'test/probe_accs': 0.21875, 'lr': 0.00027959164115195926, 'epoch': 75, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.1164309978485107
1 0.0 2.106992244720459
2 0.171875 1.952906608581543
3 0.09375 1.9128199815750122
4 0.6875 1.648160457611084
5 0.21875 1.8318778276443481
6 0.28125 1.780396580696106
7 0.25 1.5728206634521484
8 0.421875 1.6054728031158447
9 0.53125 1.6555202007293701
10 0.21875 2.254915714263916
11 0.25 2.188107967376709
test 0 0.03125 2.2320570945739746
test 1 0.21875 1.9722625017166138


Overall:  19%|▏| 77/400 [6:26:58<16:51:37, 187.92s/it, decoder_mask_ratio=0.75, epoch=76, lr=0.000279, 

{'train/loss': 3.5485326424241066, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5485326424241066, 'train/num_steps': 2464, 'train/cos_sim_encoder_output': 0.9597698785364628, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9924847651273012, 'train/probe_losses': 1.8855351706345875, 'train/probe_accs': 0.2604166666666667, 'test/probe_losses': 2.102159798145294, 'test/probe_accs': 0.125, 'lr': 0.00027897897582596694, 'epoch': 76, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.03125 2.2132625579833984
1 0.1875 2.2498788833618164
2 0.46875 1.880492091178894
3 0.28125 1.9358330965042114
4 0.3125 1.7351665496826172
5 0.34375 1.6096796989440918
6 0.28125 1.4859578609466553
7 0.625 1.2750896215438843
8 0.4375 1.5248210430145264
9 0.5 1.7777490615844727
10 0.34375 2.570004940032959
11 0.21875 2.5099596977233887
test 0 0.0 2.4330222606658936
test 1 0.0 2.626142740249634


Overall:  20%|▏| 78/400 [6:30:17<17:06:36, 191.29s/it, decoder_mask_ratio=0.75, epoch=77, lr=0.000278, 

{'train/loss': 3.471140146255493, 'train/recon_losses': nan, 'train/contrastive_losses': 3.471140146255493, 'train/num_steps': 2496, 'train/cos_sim_encoder_output': 0.9614818729460239, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9887901898473501, 'train/probe_losses': 1.8973245918750763, 'train/probe_accs': 0.3359375, 'test/probe_losses': 2.5295825004577637, 'test/probe_accs': 0.0, 'lr': 0.00027835794162839864, 'epoch': 77, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.1752712726593018
1 0.0 2.083728075027466
2 0.125 2.1275124549865723
3 0.4375 1.7247607707977295
4 0.46875 1.8010214567184448
5 0.40625 1.6433576345443726
6 0.21875 1.9677335023880005
7 0.59375 1.4378411769866943
8 0.3125 1.6759114265441895
9 0.375 1.631523609161377
10 0.28125 1.7545499801635742
11 0.46875 1.4758377075195312
test 0 0.375 1.9428883790969849
test 1 0.28125 2.282148599624634


Overall:  20%|▏| 79/400 [6:32:55<16:09:39, 181.25s/it, decoder_mask_ratio=0.75, epoch=78, lr=0.000278, 

{'train/loss': 3.4912347868084908, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4912347868084908, 'train/num_steps': 2528, 'train/cos_sim_encoder_output': 0.9350546393543482, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9791892077773809, 'train/probe_losses': 1.7915874222914379, 'train/probe_accs': 0.3072916666666667, 'test/probe_losses': 2.1125184893608093, 'test/probe_accs': 0.328125, 'lr': 0.0002777285788572737, 'epoch': 78, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.09375 2.1464486122131348
1 0.09375 2.1234970092773438
2 0.03125 2.147423505783081
3 0.3125 1.9266425371170044
4 0.40625 1.7471635341644287
5 0.34375 1.689058780670166
6 0.484375 1.5535434484481812
7 0.6875 1.7080167531967163
8 0.6875 1.4910497665405273
9 0.53125 1.530253529548645
10 0.40625 1.7656400203704834
11 0.28125 1.9637608528137207
test 0 0.28125 1.7057411670684814
test 1 0.21875 2.1578681468963623


Overall:  20%|▏| 80/400 [6:35:40<15:40:18, 176.31s/it, decoder_mask_ratio=0.75, epoch=79, lr=0.000277, 

{'train/loss': 3.4025334790349007, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4025334790349007, 'train/num_steps': 2560, 'train/cos_sim_encoder_output': 0.9550497084856033, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9905468765646219, 'train/probe_losses': 1.8160415291786194, 'train/probe_accs': 0.36328125, 'test/probe_losses': 1.9318046569824219, 'test/probe_accs': 0.25, 'lr': 0.0002770909283510402, 'epoch': 79, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.5464444160461426
1 0.0 2.4128811359405518
2 0.0 2.318953514099121
3 0.5 2.005981683731079
4 0.3125 1.926561713218689
5 0.21875 1.793287754058838
6 0.4375 1.918318748474121
7 0.46875 1.8489489555358887
8 0.34375 2.0349042415618896
9 0.09375 2.3473563194274902
10 0.3125 1.7805070877075195
11 0.25 1.9462376832962036
test 0 0.421875 1.4031814336776733
test 1 0.234375 1.8563967943191528


Overall:  20%|▏| 81/400 [6:38:24<15:17:58, 172.66s/it, decoder_mask_ratio=0.75, epoch=80, lr=0.000276, 

{'train/loss': 3.497259773313999, 'train/recon_losses': nan, 'train/contrastive_losses': 3.497259773313999, 'train/num_steps': 2592, 'train/cos_sim_encoder_output': 0.9648640044033527, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9922468662261963, 'train/probe_losses': 2.073365271091461, 'train/probe_accs': 0.24479166666666666, 'test/probe_losses': 1.629789113998413, 'test/probe_accs': 0.328125, 'lr': 0.00027644503148592585, 'epoch': 80, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.46875 1.9355419874191284
1 0.28125 2.1423490047454834
2 0.375 1.966219425201416
3 0.21875 1.948918342590332
4 0.21875 1.7397129535675049
5 0.1875 1.9204705953598022
6 0.25 1.7558727264404297
7 0.25 1.8457026481628418
8 0.28125 1.9346592426300049
9 0.21875 1.9275870323181152
10 0.0625 2.2799816131591797
11 0.0625 2.3783271312713623
test 0 0.109375 1.7459367513656616
test 1 0.0625 1.7629196643829346


Overall:  20%|▏| 82/400 [6:41:46<16:02:19, 181.57s/it, decoder_mask_ratio=0.75, epoch=81, lr=0.000276, 

{'train/loss': 3.520192474126816, 'train/recon_losses': nan, 'train/contrastive_losses': 3.520192474126816, 'train/num_steps': 2624, 'train/cos_sim_encoder_output': 0.963735643774271, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9932053182274103, 'train/probe_losses': 1.9812785585721333, 'train/probe_accs': 0.23958333333333334, 'test/probe_losses': 1.754428207874298, 'test/probe_accs': 0.0859375, 'lr': 0.00027579093017325254, 'epoch': 81, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.615309238433838
1 0.0 2.7540152072906494
2 0.0 2.4837749004364014
3 0.15625 2.05312442779541
4 0.71875 1.7777142524719238
5 0.34375 1.78849458694458
6 0.21875 1.7916632890701294
7 0.25 1.9637677669525146
8 0.03125 2.4718894958496094
9 0.296875 1.9381827116012573
10 0.34375 1.909850835800171
11 0.265625 2.165557622909546
test 0 0.3125 2.1136367321014404
test 1 0.28125 2.3168983459472656


Overall:  21%|▏| 83/400 [6:44:46<15:55:41, 180.89s/it, decoder_mask_ratio=0.75, epoch=82, lr=0.000275, 

{'train/loss': 3.5259661749005318, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5259661749005318, 'train/num_steps': 2656, 'train/cos_sim_encoder_output': 0.9591170400381088, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9948381073772907, 'train/probe_losses': 2.142778694629669, 'train/probe_accs': 0.21875, 'test/probe_losses': 2.215267539024353, 'test/probe_accs': 0.296875, 'lr': 0.0002751286668567171, 'epoch': 82, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.125 2.2087206840515137
1 0.28125 2.131437301635742
2 0.125 2.1448822021484375
3 0.25 1.9623905420303345
4 0.21875 2.016615629196167
5 0.28125 1.9358036518096924
6 0.34375 1.8162798881530762
7 0.40625 1.7231966257095337
8 0.15625 2.1835033893585205
9 0.09375 2.2661378383636475
10 0.15625 2.0454111099243164
11 0.125 2.0678298473358154
test 0 0.0 1.9523075819015503
test 1 0.0 2.19657301902771


Overall:  21%|▏| 84/400 [6:47:33<15:30:43, 176.72s/it, decoder_mask_ratio=0.75, epoch=83, lr=0.000274, 

{'train/loss': 3.5420797243714333, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5420797243714333, 'train/num_steps': 2688, 'train/cos_sim_encoder_output': 0.970984498038888, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9907162897288799, 'train/probe_losses': 2.041850725809733, 'train/probe_accs': 0.21354166666666666, 'test/probe_losses': 2.07444030046463, 'test/probe_accs': 0.0, 'lr': 0.00027445828450963714, 'epoch': 83, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.28125 2.0216870307922363
1 0.125 2.0594213008880615
2 0.15625 1.9023277759552002
3 0.265625 1.69732666015625
4 0.625 1.7731112241744995
5 0.46875 1.7629088163375854
6 0.25 1.808825969696045
7 0.21875 1.7796932458877563
8 0.15625 1.777557611465454
9 0.53125 1.626122236251831
10 0.71875 1.4268516302108765
11 0.65625 1.3659065961837769
test 0 0.4375 1.8827484846115112
test 1 0.75 1.1899394989013672


Overall:  21%|▏| 85/400 [6:50:08<14:53:33, 170.20s/it, decoder_mask_ratio=0.75, epoch=84, lr=0.000274, 

{'train/loss': 3.574447438120842, 'train/recon_losses': nan, 'train/contrastive_losses': 3.574447438120842, 'train/num_steps': 2720, 'train/cos_sim_encoder_output': 0.9611107800155878, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9926102459430695, 'train/probe_losses': 1.750145008166631, 'train/probe_accs': 0.37109375, 'test/probe_losses': 1.5363439917564392, 'test/probe_accs': 0.59375, 'lr': 0.0002737798266321624, 'epoch': 84, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0625 2.129628896713257
1 0.03125 2.113811731338501
2 0.0 1.9239373207092285
3 0.15625 1.8085262775421143
4 0.3125 1.9929102659225464
5 0.1875 1.9404001235961914
6 0.40625 1.6248993873596191
7 0.21875 1.893409013748169
8 0.25 1.7470076084136963
9 0.25 1.5821806192398071
10 0.71875 1.4261181354522705
11 0.4375 1.8240861892700195
test 0 0.09375 2.2563302516937256
test 1 0.125 2.466012954711914


Overall:  22%|▏| 86/400 [6:52:48<14:34:22, 167.08s/it, decoder_mask_ratio=0.75, epoch=85, lr=0.000273, 

{'train/loss': 3.430691733956337, 'train/recon_losses': nan, 'train/contrastive_losses': 3.430691733956337, 'train/num_steps': 2752, 'train/cos_sim_encoder_output': 0.9592140801250935, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9911342356353998, 'train/probe_losses': 1.8339096307754517, 'train/probe_accs': 0.2526041666666667, 'test/probe_losses': 2.36117160320282, 'test/probe_accs': 0.109375, 'lr': 0.00027309333724845233, 'epoch': 85, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.375 1.9886634349822998
1 0.4375 1.920386552810669
2 0.34375 1.789245843887329
3 0.15625 1.6796748638153076
4 0.3125 1.979638695716858
5 0.40625 1.5372228622436523
6 0.375 1.7442351579666138
7 0.28125 1.870185136795044
8 0.09375 2.223820447921753
9 0.1875 1.9066063165664673
10 0.03125 2.5381109714508057
11 0.0625 2.714468002319336
test 0 0.21875 1.969651460647583
test 1 0.34375 1.5944479703903198


Overall:  22%|▏| 87/400 [6:55:32<14:28:17, 166.45s/it, decoder_mask_ratio=0.75, epoch=86, lr=0.000272, 

{'train/loss': 3.4697273075580597, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4697273075580597, 'train/num_steps': 2784, 'train/cos_sim_encoder_output': 0.9573610182851553, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9928163457661867, 'train/probe_losses': 1.9910215238730113, 'train/probe_accs': 0.2552083333333333, 'test/probe_losses': 1.7820497155189514, 'test/probe_accs': 0.28125, 'lr': 0.00027239886090381913, 'epoch': 86, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.28125 1.8740805387496948
1 0.21875 1.986635446548462
2 0.4375 1.7480260133743286
3 0.34375 1.6986085176467896
4 0.5 1.8588467836380005
5 0.1875 1.8445837497711182
6 0.1875 1.8266456127166748
7 0.359375 1.388288140296936
8 0.3125 1.6292872428894043
9 0.1875 1.7861558198928833
10 0.1875 1.6941636800765991
11 0.3125 1.6309692859649658
test 0 0.125 1.910718321800232
test 1 0.1875 1.7433091402053833


Overall:  22%|▏| 88/400 [6:59:34<16:22:17, 188.90s/it, decoder_mask_ratio=0.75, epoch=87, lr=0.000272, 

{'train/loss': 3.450582318007946, 'train/recon_losses': nan, 'train/contrastive_losses': 3.450582318007946, 'train/num_steps': 2816, 'train/cos_sim_encoder_output': 0.9500014409422874, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9895587731152773, 'train/probe_losses': 1.7471909026304882, 'train/probe_accs': 0.29296875, 'test/probe_losses': 1.8270137310028076, 'test/probe_accs': 0.15625, 'lr': 0.0002716964426618378, 'epoch': 87, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0625 2.2107715606689453
1 0.125 2.12896466255188
2 0.34375 2.0114731788635254
3 0.1875 1.7852940559387207
4 0.625 1.4589630365371704
5 0.578125 1.3211262226104736
6 0.25 1.6386429071426392
7 0.21875 1.7851370573043823
8 0.28125 1.828049659729004
9 0.1875 1.8904297351837158
10 0.0625 2.212740182876587
11 0.140625 2.100574016571045
test 0 0.15625 2.378267288208008
test 1 0.1875 2.2292563915252686


Overall:  22%|▏| 89/400 [7:02:31<16:00:59, 185.40s/it, decoder_mask_ratio=0.75, epoch=88, lr=0.000271, 

{'train/loss': 3.434016525745392, 'train/recon_losses': nan, 'train/contrastive_losses': 3.434016525745392, 'train/num_steps': 2848, 'train/cos_sim_encoder_output': 0.9565634988248348, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9853475000709295, 'train/probe_losses': 1.8643471896648407, 'train/probe_accs': 0.2552083333333333, 'test/probe_losses': 2.303761839866638, 'test/probe_accs': 0.171875, 'lr': 0.00027098612810142126, 'epoch': 88, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.2493650913238525
1 0.0 2.149527072906494
2 0.0 1.9731152057647705
3 0.1875 1.8346836566925049
4 0.25 1.814103364944458
5 0.3125 1.88827645778656
6 0.21875 1.6527643203735352
7 0.53125 1.7932310104370117
8 0.6875 1.2678507566452026
9 0.71875 1.3292160034179688
10 0.46875 1.6414551734924316
11 0.25 1.594714879989624
test 0 0.40625 1.925216555595398
test 1 0.375 1.6983449459075928


Overall:  22%|▏| 90/400 [7:05:10<15:16:16, 177.34s/it, decoder_mask_ratio=0.75, epoch=89, lr=0.00027, t

{'train/loss': 3.5061685666441917, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5061685666441917, 'train/num_steps': 2880, 'train/cos_sim_encoder_output': 0.9584487769752741, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.986154505982995, 'train/probe_losses': 1.765691916147868, 'train/probe_accs': 0.3020833333333333, 'test/probe_losses': 1.8117807507514954, 'test/probe_accs': 0.390625, 'lr': 0.0002702679633138635, 'epoch': 89, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.09375 2.24166202545166
1 0.09375 2.1340510845184326
2 0.0 2.0022060871124268
3 0.5625 1.6946232318878174
4 0.40625 1.6049387454986572
5 0.3125 1.5876295566558838
6 0.625 1.4606707096099854
7 0.40625 1.6067304611206055
8 0.46875 1.660677194595337
9 0.34375 2.1718010902404785
10 0.125 2.281721830368042
11 0.15625 2.7146835327148438
test 0 0.0 2.7640411853790283
test 1 0.03125 2.7486956119537354


Overall:  23%|▏| 91/400 [7:08:05<15:10:27, 176.79s/it, decoder_mask_ratio=0.75, epoch=90, lr=0.00027, t

{'train/loss': 3.5778543651103973, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5778543651103973, 'train/num_steps': 2912, 'train/cos_sim_encoder_output': 0.9609407652169466, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9875290859490633, 'train/probe_losses': 1.9301162958145142, 'train/probe_accs': 0.2994791666666667, 'test/probe_losses': 2.756368398666382, 'test/probe_accs': 0.015625, 'lr': 0.0002695419948998485, 'epoch': 90, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 1.8841444253921509
1 0.0 1.9621416330337524
2 0.25 2.0426697731018066
3 0.25 1.7989263534545898
4 0.375 1.732908010482788
5 0.21875 2.041592597961426
6 0.1875 1.8564507961273193
7 0.09375 1.9304003715515137
8 0.125 2.1392295360565186
9 0.1875 1.9493829011917114
10 0.3125 1.9721980094909668
11 0.21875 2.026366710662842
test 0 0.65625 1.246083378791809


Overall:  23%|▏| 92/400 [7:11:15<15:27:02, 180.59s/it, decoder_mask_ratio=0.75, epoch=91, lr=0.000269, 

test 1 0.21875 1.7301530838012695
{'train/loss': 3.4271460995078087, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4271460995078087, 'train/num_steps': 2944, 'train/cos_sim_encoder_output': 0.9597031865268946, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.986525846645236, 'train/probe_losses': 1.944700926542282, 'train/probe_accs': 0.18489583333333334, 'test/probe_losses': 1.4881182312965393, 'test/probe_accs': 0.4375, 'lr': 0.00026880826996642616, 'epoch': 91, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.15625 2.31388521194458
1 0.375 2.0694282054901123
2 0.40625 1.8853981494903564
3 0.1875 1.8292399644851685
4 0.3125 1.5003606081008911
5 0.78125 1.4583872556686401
6 0.4375 1.972822666168213
7 0.40625 2.0311007499694824
8 0.171875 2.351980447769165
9 0.25 2.2977654933929443
10 0.234375 1.9865524768829346
11 0.125 2.026702642440796
test 0 0.03125 2.878419876098633
test 1 0.03125 2.684267282485962


Overall:  23%|▏| 93/400 [7:13:54<14:51:33, 174.25s/it, decoder_mask_ratio=0.75, epoch=92, lr=0.000268, 

{'train/loss': 3.39670280367136, 'train/recon_losses': nan, 'train/contrastive_losses': 3.39670280367136, 'train/num_steps': 2976, 'train/cos_sim_encoder_output': 0.9530757237225771, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9863450340926647, 'train/probe_losses': 1.976968655983607, 'train/probe_accs': 0.3203125, 'test/probe_losses': 2.7813435792922974, 'test/probe_accs': 0.03125, 'lr': 0.00026806683612395595, 'epoch': 92, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.296645402908325
1 0.03125 2.2244603633880615
2 0.125 1.9786152839660645
3 0.59375 1.7005512714385986
4 0.6875 1.3743869066238403
5 0.59375 1.3291778564453125
6 0.875 0.9264823198318481
7 0.65625 1.2273221015930176
8 0.4375 1.718697190284729
9 0.4375 1.692299723625183
10 0.40625 1.6275577545166016
11 0.28125 1.7451508045196533
test 0 0.1875 2.1895198822021484
test 1 0.53125 1.5636227130889893


Overall:  24%|▏| 94/400 [7:17:49<16:21:08, 192.38s/it, decoder_mask_ratio=0.75, epoch=93, lr=0.000267, 

{'train/loss': 3.4087361246347427, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4087361246347427, 'train/num_steps': 3008, 'train/cos_sim_encoder_output': 0.9554634056985378, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9893403891474009, 'train/probe_losses': 1.653445581595103, 'train/probe_accs': 0.4270833333333333, 'test/probe_losses': 1.8765712976455688, 'test/probe_accs': 0.359375, 'lr': 0.0002673177414830175, 'epoch': 93, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.125 1.975584626197815
1 0.1875 1.9692597389221191
2 0.390625 1.9492980241775513
3 0.25 1.7315986156463623
4 0.34375 1.496881365776062
5 0.3125 1.3834000825881958
6 1.0 1.0801317691802979
7 0.71875 1.2641053199768066
8 0.5625 1.5091748237609863
9 0.40625 1.8184016942977905
10 0.3125 1.6509350538253784
11 0.5 1.3718069791793823
test 0 0.15625 2.206597328186035
test 1 0.15625 1.9461674690246582


Overall:  24%|▏| 95/400 [7:20:28<15:27:01, 182.37s/it, decoder_mask_ratio=0.75, epoch=94, lr=0.000267, 

{'train/loss': 3.4692534878849983, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4692534878849983, 'train/num_steps': 3040, 'train/cos_sim_encoder_output': 0.9673946276307106, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9888940490782261, 'train/probe_losses': 1.600048174460729, 'train/probe_accs': 0.42578125, 'test/probe_losses': 2.0763823986053467, 'test/probe_accs': 0.15625, 'lr': 0.00026656103465128847, 'epoch': 94, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.015625 2.0558760166168213
1 0.171875 2.0891273021698
2 0.390625 1.9861053228378296
3 0.0625 2.001437187194824
4 0.25 1.9042718410491943
5 0.09375 1.8943644762039185
6 0.40625 1.6965160369873047
7 0.4375 1.5352927446365356
8 0.25 1.6936166286468506
9 0.40625 1.6783761978149414
10 0.390625 1.524875521659851
11 0.375 1.6285145282745361
test 0 0.5 1.9211366176605225
test 1 0.609375 1.8073543310165405


Overall:  24%|▏| 96/400 [7:24:09<16:23:43, 194.16s/it, decoder_mask_ratio=0.75, epoch=95, lr=0.000266, 

{'train/loss': 3.56594730168581, 'train/recon_losses': nan, 'train/contrastive_losses': 3.56594730168581, 'train/num_steps': 3072, 'train/cos_sim_encoder_output': 0.9569875616580248, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9924566689878702, 'train/probe_losses': 1.8073644836743672, 'train/probe_accs': 0.2708333333333333, 'test/probe_losses': 1.8642454743385315, 'test/probe_accs': 0.5546875, 'lr': 0.0002657967647303907, 'epoch': 95, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.09375 2.166985511779785
1 0.09375 2.148228645324707
2 0.53125 1.951758623123169
3 0.53125 1.7358766794204712
4 0.53125 1.5699583292007446
5 0.6875 1.1400463581085205
6 0.3125 1.8147106170654297
7 0.1875 2.186816692352295
8 0.1875 2.0600521564483643
9 0.125 2.1367735862731934
10 0.109375 2.161763906478882
11 0.09375 2.2201290130615234
test 0 0.46875 1.5033657550811768
test 1 0.609375 1.2963688373565674


Overall:  24%|▏| 97/400 [7:27:49<16:58:55, 201.77s/it, decoder_mask_ratio=0.75, epoch=96, lr=0.000265, 

{'train/loss': 3.5343489572405815, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5343489572405815, 'train/num_steps': 3104, 'train/cos_sim_encoder_output': 0.9614454172551632, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9926191233098507, 'train/probe_losses': 1.9410916765530903, 'train/probe_accs': 0.2903645833333333, 'test/probe_losses': 1.399867296218872, 'test/probe_accs': 0.5390625, 'lr': 0.00026502498131270426, 'epoch': 96, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.25 1.9405012130737305
1 0.1875 1.9809999465942383
2 0.09375 1.9588862657546997
3 0.40625 1.7353075742721558
4 0.46875 1.4979565143585205
5 0.84375 1.2591662406921387
6 1.0 1.017226219177246
7 0.5 1.2123664617538452
8 0.3125 1.5709125995635986
9 0.375 1.2781200408935547
10 0.21875 1.7228275537490845
11 0.34375 1.6487897634506226
test 0 0.375 2.3160297870635986
test 1 0.375 2.2248127460479736


Overall:  24%|▏| 98/400 [7:30:24<15:45:02, 187.76s/it, decoder_mask_ratio=0.75, epoch=97, lr=0.000264, 

{'train/loss': 3.4323439821600914, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4323439821600914, 'train/num_steps': 3136, 'train/cos_sim_encoder_output': 0.9359681867063046, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9923491030931473, 'train/probe_losses': 1.5685883661111195, 'train/probe_accs': 0.4166666666666667, 'test/probe_losses': 2.270421266555786, 'test/probe_accs': 0.375, 'lr': 0.00026424573447814906, 'epoch': 97, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.09375 2.2237226963043213
1 0.15625 2.133955478668213
2 0.125 2.011756181716919
3 0.1875 1.89947509765625
4 0.40625 1.6670331954956055
5 0.46875 1.6005598306655884
6 0.25 1.9336084127426147
7 0.625 1.5443848371505737
8 0.625 1.4885441064834595
9 0.90625 1.1184659004211426
10 0.875 1.166746973991394
11 0.8125 1.1924850940704346
test 0 0.6875 1.5371476411819458
test 1 0.28125 2.2502617835998535


Overall:  25%|▏| 99/400 [7:34:51<17:41:09, 211.53s/it, decoder_mask_ratio=0.75, epoch=98, lr=0.000263, 

{'train/loss': 3.473110646009445, 'train/recon_losses': nan, 'train/contrastive_losses': 3.473110646009445, 'train/num_steps': 3168, 'train/cos_sim_encoder_output': 0.9537753034383059, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9919891189783812, 'train/probe_losses': 1.665061483780543, 'train/probe_accs': 0.4609375, 'test/probe_losses': 1.8937047123908997, 'test/probe_accs': 0.484375, 'lr': 0.00026345907479093545, 'epoch': 98, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.21875 2.089322566986084
1 0.3125 2.0323102474212646
2 0.1875 1.960218071937561
3 0.25 1.8320256471633911
4 0.421875 1.679997444152832
5 0.3125 1.4975107908248901
6 0.78125 1.4672003984451294
7 0.703125 1.1536283493041992
8 0.40625 1.6798057556152344
9 0.375 1.6425766944885254
10 0.28125 1.6808816194534302
11 0.15625 1.9731839895248413
test 0 0.125 2.373821973800659
test 1 0.0 2.380122184753418


Overall:  25%|▎| 100/400 [7:39:58<20:01:08, 240.23s/it, decoder_mask_ratio=0.75, epoch=99, lr=0.000263,

{'train/loss': 3.369601257145405, 'train/recon_losses': nan, 'train/contrastive_losses': 3.369601257145405, 'train/num_steps': 3200, 'train/cos_sim_encoder_output': 0.9501009657979012, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9935447052121162, 'train/probe_losses': 1.7240551312764485, 'train/probe_accs': 0.3671875, 'test/probe_losses': 2.3769720792770386, 'test/probe_accs': 0.0625, 'lr': 0.00026266505329628333, 'epoch': 99, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.3125 2.279782295227051
1 0.59375 1.9717624187469482
2 0.40625 2.0068130493164062
3 0.28125 1.8070591688156128
4 0.4375 1.3886897563934326
5 0.5625 1.4801077842712402
6 0.34375 1.671001672744751
7 0.5 1.4160226583480835
8 0.40625 1.7693463563919067
9 0.28125 2.024211883544922
10 0.453125 1.7194291353225708
11 0.78125 1.252458930015564
test 0 0.0625 2.2573916912078857
test 1 0.0 2.677495241165161


Overall:  25%|▎| 101/400 [7:44:28<20:41:39, 249.16s/it, decoder_mask_ratio=0.75, epoch=100, lr=0.000262

{'train/loss': 3.408740796148777, 'train/recon_losses': nan, 'train/contrastive_losses': 3.408740796148777, 'train/num_steps': 3232, 'train/cos_sim_encoder_output': 0.9546539299190044, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9944881405681372, 'train/probe_losses': 1.7322237590948741, 'train/probe_accs': 0.4466145833333333, 'test/probe_losses': 2.4674434661865234, 'test/probe_accs': 0.03125, 'lr': 0.0002618637215171095, 'epoch': 100, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.3646271228790283
1 0.0 2.0105230808258057
2 0.3125 1.8768935203552246
3 0.5625 1.7449702024459839
4 0.375 1.7896497249603271
5 0.21875 1.991927981376648
6 0.5625 1.4627741575241089
7 0.21875 1.8007569313049316
8 0.25 1.8508152961730957
9 0.28125 2.0713610649108887
10 0.0625 2.2789976596832275
11 0.1875 1.8531394004821777
test 0 0.3125 2.382096529006958
test 1 0.40625 1.8500388860702515


Overall:  26%|▎| 102/400 [7:47:26<18:51:06, 227.74s/it, decoder_mask_ratio=0.75, epoch=101, lr=0.000261

{'train/loss': 3.384397730231285, 'train/recon_losses': nan, 'train/contrastive_losses': 3.384397730231285, 'train/num_steps': 3264, 'train/cos_sim_encoder_output': 0.9453308694064617, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9926619082689285, 'train/probe_losses': 1.9247030119101207, 'train/probe_accs': 0.2526041666666667, 'test/probe_losses': 2.1160677075386047, 'test/probe_accs': 0.359375, 'lr': 0.00026105513145068476, 'epoch': 101, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.03125 2.022113800048828
1 0.0625 1.9795985221862793
2 0.734375 1.8202754259109497
3 0.25 1.6945686340332031
4 0.1875 1.5434386730194092
5 0.78125 1.2832653522491455
6 0.765625 1.2684868574142456
7 0.75 1.2569398880004883
8 0.71875 1.4235392808914185
9 0.65625 1.6501641273498535
10 0.375 2.187899351119995
11 0.59375 1.6265209913253784
test 0 0.234375 2.5391414165496826
test 1 0.296875 2.1372275352478027


Overall:  26%|▎| 103/400 [7:50:00<16:57:16, 205.51s/it, decoder_mask_ratio=0.75, epoch=102, lr=0.00026,

{'train/loss': 3.3519481271505356, 'train/recon_losses': nan, 'train/contrastive_losses': 3.3519481271505356, 'train/num_steps': 3296, 'train/cos_sim_encoder_output': 0.9435634706169367, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9912306442856789, 'train/probe_losses': 1.6464009086290996, 'train/probe_accs': 0.4921875, 'test/probe_losses': 2.3381844758987427, 'test/probe_accs': 0.265625, 'lr': 0.0002602393355652598, 'epoch': 102, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.1875 2.103930711746216
1 0.25 2.0300211906433105
2 0.09375 1.9885636568069458
3 0.5625 1.628939151763916
4 0.3125 1.7780468463897705
5 0.6875 1.40422523021698
6 0.71875 1.217556357383728
7 0.4375 1.671470046043396
8 0.5 1.8031760454177856
9 0.4375 1.9425758123397827
10 0.125 2.629577875137329
11 0.0625 2.640545606613159
test 0 0.09375 2.367156982421875
test 1 0.25 2.018862009048462


Overall:  26%|▎| 104/400 [7:52:35<15:39:05, 190.36s/it, decoder_mask_ratio=0.75, epoch=103, lr=0.000259

{'train/loss': 3.5043044313788414, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5043044313788414, 'train/num_steps': 3328, 'train/cos_sim_encoder_output': 0.9578559380024672, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9933178368955851, 'train/probe_losses': 1.9032190442085266, 'train/probe_accs': 0.3645833333333333, 'test/probe_losses': 2.1930094957351685, 'test/probe_accs': 0.171875, 'lr': 0.00025941638679666056, 'epoch': 103, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.3125 1.9705865383148193
1 0.125 2.028428554534912
2 0.125 2.14597225189209
3 0.375 1.990180253982544
4 0.359375 1.7157875299453735
5 0.65625 1.6550281047821045
6 0.65625 1.6354061365127563
7 0.46875 1.621544599533081
8 0.34375 1.7620937824249268
9 0.15625 1.9407997131347656
10 0.21875 1.7676903009414673
11 0.25 1.7738004922866821
test 0 0.25 1.8985642194747925
test 1 0.21875 1.9832500219345093


Overall:  26%|▎| 105/400 [7:57:21<17:57:44, 219.20s/it, decoder_mask_ratio=0.75, epoch=104, lr=0.000259

{'train/loss': 3.40074709802866, 'train/recon_losses': nan, 'train/contrastive_losses': 3.40074709802866, 'train/num_steps': 3360, 'train/cos_sim_encoder_output': 0.9556003119796515, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9899186939001083, 'train/probe_losses': 1.8339431881904602, 'train/probe_accs': 0.3372395833333333, 'test/probe_losses': 1.9409071207046509, 'test/probe_accs': 0.234375, 'lr': 0.0002585863385448533, 'epoch': 104, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.3125 1.9375059604644775
1 0.09375 1.9430229663848877
2 0.21875 1.8399981260299683
3 0.5625 1.6809509992599487
4 0.34375 1.828930139541626
5 0.28125 1.514792799949646
6 0.84375 1.3428112268447876
7 0.5 1.814707636833191
8 0.6875 1.4253746271133423
9 0.46875 1.7153633832931519
10 0.40625 1.742495059967041
11 0.25 2.0922274589538574
test 0 0.25 2.1957263946533203
test 1 0.40625 1.9313077926635742


Overall:  26%|▎| 106/400 [8:00:12<16:42:41, 204.63s/it, decoder_mask_ratio=0.75, epoch=105, lr=0.000258

{'train/loss': 3.3837085738778114, 'train/recon_losses': nan, 'train/contrastive_losses': 3.3837085738778114, 'train/num_steps': 3392, 'train/cos_sim_encoder_output': 0.9554632846266031, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9933966808021069, 'train/probe_losses': 1.739848365386327, 'train/probe_accs': 0.4140625, 'test/probe_losses': 2.0635170936584473, 'test/probe_accs': 0.328125, 'lr': 0.00025774924467047945, 'epoch': 105, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.1245269775390625
1 0.0 2.15678334236145
2 0.1875 2.0280725955963135
3 0.34375 1.9604765176773071
4 0.53125 1.9707059860229492
5 0.5 1.813184142112732
6 0.28125 1.8106400966644287
7 0.25 2.087106466293335
8 0.28125 1.7584164142608643
9 0.375 1.6461422443389893
10 0.375 1.5749331712722778
11 0.40625 1.5407438278198242
test 0 0.640625 1.2944086790084839
test 1 0.5 1.740355134010315


Overall:  27%|▎| 107/400 [8:03:13<16:04:34, 197.52s/it, decoder_mask_ratio=0.75, epoch=106, lr=0.000257

{'train/loss': 3.4357046112418175, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4357046112418175, 'train/num_steps': 3424, 'train/cos_sim_encoder_output': 0.9577332194894552, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9931612182408571, 'train/probe_losses': 1.8726443151632945, 'train/probe_accs': 0.2942708333333333, 'test/probe_losses': 1.5173819065093994, 'test/probe_accs': 0.5703125, 'lr': 0.00025690515949136107, 'epoch': 106, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.15625 2.345799684524536
1 0.125 2.2344374656677246
2 0.1875 1.973386287689209
3 0.53125 1.790635347366333
4 0.65625 1.4618253707885742
5 0.4375 1.569928765296936
6 0.34375 1.4534906148910522
7 0.5 1.5411529541015625
8 0.5 1.5642726421356201
9 0.65625 1.4238375425338745
10 0.59375 1.3402270078659058
11 0.59375 1.3877736330032349
test 0 0.375 1.6837801933288574
test 1 0.40625 1.8293519020080566


Overall:  27%|▎| 108/400 [8:05:50<15:03:00, 185.55s/it, decoder_mask_ratio=0.75, epoch=107, lr=0.000256

{'train/loss': 3.474063627421856, 'train/recon_losses': nan, 'train/contrastive_losses': 3.474063627421856, 'train/num_steps': 3456, 'train/cos_sim_encoder_output': 0.950796652585268, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9951607659459114, 'train/probe_losses': 1.673897276322047, 'train/probe_accs': 0.4401041666666667, 'test/probe_losses': 1.756566047668457, 'test/probe_accs': 0.390625, 'lr': 0.00025605413777897575, 'epoch': 107, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0625 1.8186039924621582
1 0.71875 1.7293245792388916
2 0.4375 1.7436285018920898
3 0.5625 1.7113721370697021
4 0.53125 1.6909841299057007
5 0.34375 1.996100902557373
6 0.5625 1.4911236763000488
7 0.375 1.7138011455535889
8 0.3125 1.6462186574935913
9 0.25 1.8270596265792847
10 0.328125 1.8788524866104126
11 0.21875 1.8505780696868896
test 0 0.125 2.4407267570495605
test 1 0.21875 2.2794690132141113


Overall:  27%|▎| 109/400 [8:08:26<14:17:10, 176.74s/it, decoder_mask_ratio=0.75, epoch=108, lr=0.000255

{'train/loss': 3.286380670964718, 'train/recon_losses': nan, 'train/contrastive_losses': 3.286380670964718, 'train/num_steps': 3488, 'train/cos_sim_encoder_output': 0.9396023042500019, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9940354563295841, 'train/probe_losses': 1.758137325445811, 'train/probe_accs': 0.3919270833333333, 'test/probe_losses': 2.360097885131836, 'test/probe_accs': 0.171875, 'lr': 0.00025519623475490287, 'epoch': 108, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.125 1.9102989435195923
1 0.03125 2.0157792568206787
2 0.34375 1.9128730297088623
3 0.34375 1.739699363708496
4 0.15625 1.996622920036316
5 0.09375 2.289635419845581
6 0.09375 2.1019670963287354
7 0.09375 2.1981594562530518
8 0.25 2.021538257598877
9 0.09375 2.0269505977630615
10 0.21875 2.031449317932129
11 0.34375 1.7392505407333374
test 0 0.25 1.863077163696289
test 1 0.21875 1.877065896987915


Overall:  28%|▎| 110/400 [8:11:06<13:50:00, 171.73s/it, decoder_mask_ratio=0.75, epoch=109, lr=0.000254

{'train/loss': 3.397899128496647, 'train/recon_losses': nan, 'train/contrastive_losses': 3.397899128496647, 'train/num_steps': 3520, 'train/cos_sim_encoder_output': 0.9361991565674543, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9917813055217266, 'train/probe_losses': 1.9986853500207264, 'train/probe_accs': 0.18229166666666666, 'test/probe_losses': 1.870071530342102, 'test/probe_accs': 0.234375, 'lr': 0.00025433150608724015, 'epoch': 109, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.125 1.9370533227920532
1 0.15625 2.140540838241577
2 0.4375 2.008927822113037
3 0.4375 1.7355763912200928
4 0.34375 1.7243797779083252
5 0.25 1.8363710641860962
6 0.15625 1.804734706878662
7 0.40625 1.9986450672149658
8 0.3125 2.0722696781158447
9 0.09375 2.38980770111084
10 0.09375 2.4330759048461914
11 0.15625 2.2654480934143066
test 0 1.0 0.9310229420661926
test 1 0.6875 1.2200690507888794


Overall:  28%|▎| 111/400 [8:16:00<16:42:56, 208.22s/it, decoder_mask_ratio=0.75, epoch=110, lr=0.000253

{'train/loss': 3.7442485615611076, 'train/recon_losses': nan, 'train/contrastive_losses': 3.7442485615611076, 'train/num_steps': 3552, 'train/cos_sim_encoder_output': 0.9706528820097446, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9961789064109325, 'train/probe_losses': 2.028902530670166, 'train/probe_accs': 0.24739583333333334, 'test/probe_losses': 1.075545996427536, 'test/probe_accs': 0.84375, 'lr': 0.0002534600078869918, 'epoch': 110, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.34375 2.0769684314727783
1 0.15625 2.236509323120117
2 0.40625 1.925354242324829
3 0.21875 1.9010069370269775
4 0.125 1.6612759828567505
5 0.65625 1.4769083261489868
6 0.25 1.7310682535171509
7 0.25 1.7830482721328735
8 0.15625 2.1744956970214844
9 0.125 1.9165921211242676
10 0.0 2.0973145961761475
11 0.09375 1.9650299549102783
test 0 0.46875 1.7873990535736084
test 1 0.5 1.6811295747756958


Overall:  28%|▎| 112/400 [8:18:37<15:26:38, 193.05s/it, decoder_mask_ratio=0.75, epoch=111, lr=0.000253

{'train/loss': 3.47626506537199, 'train/recon_losses': nan, 'train/contrastive_losses': 3.47626506537199, 'train/num_steps': 3584, 'train/cos_sim_encoder_output': 0.948223939165473, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9922657385468483, 'train/probe_losses': 1.9121310114860535, 'train/probe_accs': 0.23177083333333334, 'test/probe_losses': 1.734264314174652, 'test/probe_accs': 0.484375, 'lr': 0.000252581796704427, 'epoch': 111, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.1875 2.0684921741485596
1 0.09375 2.0103635787963867
2 0.0625 2.035680055618286
3 0.28125 1.8092377185821533
4 0.09375 2.0377235412597656
5 0.46875 1.8491343259811401
6 0.21875 2.224663734436035
7 0.34375 1.6963855028152466
8 0.25 1.6943035125732422
9 0.15625 1.8042640686035156
10 0.1875 1.7448195219039917
11 0.03125 2.3220367431640625
test 0 0.125 2.3842761516571045
test 1 0.09375 2.2713372707366943


Overall:  28%|▎| 113/400 [8:21:16<14:33:43, 182.66s/it, decoder_mask_ratio=0.75, epoch=112, lr=0.000252

{'train/loss': 3.6347822099924088, 'train/recon_losses': nan, 'train/contrastive_losses': 3.6347822099924088, 'train/num_steps': 3616, 'train/cos_sim_encoder_output': 0.9708108771592379, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9911400210112333, 'train/probe_losses': 1.9414253731568654, 'train/probe_accs': 0.19791666666666666, 'test/probe_losses': 2.3278067111968994, 'test/probe_accs': 0.109375, 'lr': 0.0002516969295254109, 'epoch': 112, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.1625635623931885
1 0.21875 2.0402653217315674
2 0.234375 2.028092861175537
3 0.40625 1.897301197052002
4 0.375 1.9674146175384521
5 0.125 2.0757317543029785
6 0.15625 2.1037752628326416
7 0.421875 1.9644780158996582
8 0.25 1.9878205060958862
9 0.375 2.1062848567962646
10 0.8125 1.6101665496826172
11 0.71875 1.6781431436538696
test 0 0.375 1.853976845741272
test 1 0.25 1.8685851097106934


Overall:  28%|▎| 114/400 [8:23:51<13:50:39, 174.26s/it, decoder_mask_ratio=0.75, epoch=113, lr=0.000251

{'train/loss': 3.4780322909355164, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4780322909355164, 'train/num_steps': 3648, 'train/cos_sim_encoder_output': 0.9579113237559795, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9930795170366764, 'train/probe_losses': 1.9685031374295552, 'train/probe_accs': 0.3411458333333333, 'test/probe_losses': 1.8612809777259827, 'test/probe_accs': 0.3125, 'lr': 0.0002508054637677067, 'epoch': 113, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 1.8190908432006836
1 0.4375 1.894457221031189
2 0.1875 1.9077845811843872
3 0.09375 1.8199646472930908
4 0.40625 1.8469685316085815
5 0.40625 1.888940691947937
6 0.25 2.4324705600738525
7 0.21875 2.3734467029571533
8 0.34375 2.150794506072998
9 0.34375 2.436814546585083
10 0.578125 1.8343980312347412
11 0.671875 1.5922515392303467
test 0 0.0 2.3563225269317627
test 1 0.0625 2.435431480407715


Overall:  29%|▎| 115/400 [8:26:38<13:37:40, 172.14s/it, decoder_mask_ratio=0.75, epoch=114, lr=0.00025,

{'train/loss': 3.46972493827343, 'train/recon_losses': nan, 'train/contrastive_losses': 3.46972493827343, 'train/num_steps': 3680, 'train/cos_sim_encoder_output': 0.9573031421750784, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.991961844265461, 'train/probe_losses': 1.999781866868337, 'train/probe_accs': 0.328125, 'test/probe_losses': 2.3958770036697388, 'test/probe_accs': 0.03125, 'lr': 0.00024990745727724976, 'epoch': 114, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0625 2.3300304412841797
1 0.0625 2.2561521530151367
2 0.03125 2.249760627746582
3 0.0625 1.916448950767517
4 0.25 1.7255854606628418
5 0.34375 1.733125925064087
6 0.15625 1.9778709411621094
7 0.125 2.0905239582061768
8 0.25 2.0646426677703857
9 0.28125 1.9127933979034424
10 0.21875 2.3551552295684814
11 0.1875 2.1947388648986816
test 0 0.3125 1.7225216627120972
test 1 0.09375 2.0933806896209717


Overall:  29%|▎| 116/400 [8:29:18<13:17:41, 168.53s/it, decoder_mask_ratio=0.75, epoch=115, lr=0.000249

{'train/loss': 3.433428533375263, 'train/recon_losses': nan, 'train/contrastive_losses': 3.433428533375263, 'train/num_steps': 3712, 'train/cos_sim_encoder_output': 0.9585398733615875, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9953152798116207, 'train/probe_losses': 2.0672357181708017, 'train/probe_accs': 0.16927083333333334, 'test/probe_losses': 1.9079511761665344, 'test/probe_accs': 0.203125, 'lr': 0.0002490029683243946, 'epoch': 115, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.0333118438720703
1 0.0 2.036672592163086
2 0.0 1.9623433351516724
3 0.3125 2.0355138778686523
4 0.375 1.8962769508361816
5 0.171875 2.1362624168395996
6 0.125 2.2144556045532227
7 0.28125 1.943397879600525
8 0.3125 1.9237467050552368
9 0.28125 2.058098554611206
10 0.140625 1.9190666675567627
11 0.0625 2.12060546875
test 0 0.03125 1.9540901184082031
test 1 0.09375 2.019413709640503


Overall:  29%|▎| 117/400 [8:33:12<14:47:50, 188.23s/it, decoder_mask_ratio=0.75, epoch=116, lr=0.000248

{'train/loss': 3.4408927112817764, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4408927112817764, 'train/num_steps': 3744, 'train/cos_sim_encoder_output': 0.9590066969394684, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9966291896998882, 'train/probe_losses': 2.023312658071518, 'train/probe_accs': 0.171875, 'test/probe_losses': 1.986751914024353, 'test/probe_accs': 0.0625, 'lr': 0.000248092055600133, 'epoch': 116, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.34375 2.098475217819214
1 0.1875 2.0123023986816406
2 0.125 2.167653799057007
3 0.6875 1.7490158081054688
4 0.453125 1.7263537645339966
5 0.34375 1.8338390588760376
6 0.28125 1.983236312866211
7 0.21875 2.236013412475586
8 0.25 1.9293818473815918
9 0.125 2.0508220195770264
10 0.21875 1.909584879875183
11 0.15625 1.9165936708450317
test 0 0.625 1.7120308876037598
test 1 0.4375 1.8349761962890625


Overall:  30%|▎| 118/400 [8:36:33<15:03:11, 192.17s/it, decoder_mask_ratio=0.75, epoch=117, lr=0.000247

{'train/loss': 3.582152985036373, 'train/recon_losses': nan, 'train/contrastive_losses': 3.582152985036373, 'train/num_steps': 3776, 'train/cos_sim_encoder_output': 0.9649173934012651, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9969731420278549, 'train/probe_losses': 1.9677726825078328, 'train/probe_accs': 0.2825520833333333, 'test/probe_losses': 1.7735035419464111, 'test/probe_accs': 0.53125, 'lr': 0.00024717477821228624, 'epoch': 117, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.25 2.048030376434326
1 0.3125 2.085143804550171
2 0.0625 2.1770260334014893
3 0.34375 1.9886868000030518
4 0.5 1.6570554971694946
5 0.15625 1.6711276769638062
6 0.703125 1.528507113456726
7 0.84375 1.3416048288345337
8 0.40625 1.820585012435913
9 0.375 1.8588082790374756
10 0.609375 1.7990070581436157
11 0.25 2.512303113937378
test 0 0.296875 2.296733856201172
test 1 0.28125 2.5001771450042725


Overall:  30%|▎| 119/400 [8:39:14<14:15:58, 182.77s/it, decoder_mask_ratio=0.75, epoch=118, lr=0.000246

{'train/loss': 3.4828794598579407, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4828794598579407, 'train/num_steps': 3808, 'train/cos_sim_encoder_output': 0.9488490428775549, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9963834248483181, 'train/probe_losses': 1.8739904661973317, 'train/probe_accs': 0.4010416666666667, 'test/probe_losses': 2.398455500602722, 'test/probe_accs': 0.2890625, 'lr': 0.0002462511956816696, 'epoch': 118, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.3125 2.019988536834717
1 0.28125 1.9747846126556396
2 0.15625 1.9447853565216064
3 0.125 1.9735654592514038
4 0.40625 1.8678159713745117
5 0.34375 1.7560819387435913
6 0.75 2.0530831813812256
7 0.75 1.8401763439178467
8 0.6875 1.8441603183746338
9 0.6875 1.8455597162246704
10 0.578125 1.6252926588058472
11 0.609375 1.690195083618164
test 0 0.609375 1.5465219020843506
test 1 0.6875 1.4827865362167358


Overall:  30%|▎| 120/400 [8:42:42<14:47:17, 190.14s/it, decoder_mask_ratio=0.75, epoch=119, lr=0.000245

{'train/loss': 3.5126193314790726, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5126193314790726, 'train/num_steps': 3840, 'train/cos_sim_encoder_output': 0.9643173795193434, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9964157622307539, 'train/probe_losses': 1.869624098141988, 'train/probe_accs': 0.4739583333333333, 'test/probe_losses': 1.5146542191505432, 'test/probe_accs': 0.6484375, 'lr': 0.00024532136793823, 'epoch': 119, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.0546810626983643
1 0.046875 2.025327682495117
2 0.34375 1.869816780090332
3 0.28125 1.655164361000061
4 0.140625 1.6198740005493164
5 0.40625 1.716468095779419
6 0.625 1.4155220985412598
7 0.375 1.7868165969848633
8 0.28125 2.219393491744995
9 0.15625 2.218029499053955
10 0.0625 2.7352192401885986
11 0.09375 2.508512020111084
test 0 0.0 2.1095094680786133
test 1 0.09375 2.228302240371704


Overall:  30%|▎| 121/400 [8:45:36<14:21:56, 185.36s/it, decoder_mask_ratio=0.75, epoch=120, lr=0.000244

{'train/loss': 3.4966417476534843, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4966417476534843, 'train/num_steps': 3872, 'train/cos_sim_encoder_output': 0.954901397228241, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9943596310913563, 'train/probe_losses': 1.9854020774364471, 'train/probe_accs': 0.234375, 'test/probe_losses': 2.1689058542251587, 'test/probe_accs': 0.046875, 'lr': 0.00024438535531715706, 'epoch': 120, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0625 2.1117970943450928
1 0.09375 2.090096950531006
2 0.0625 1.90617835521698
3 0.5625 1.7625443935394287
4 0.3125 1.870396614074707
5 0.28125 1.858125925064087
6 0.125 1.974273443222046
7 0.09375 1.882108211517334
8 0.1875 1.4996650218963623
9 0.34375 1.5706075429916382
10 0.21875 1.6551955938339233
11 0.46875 1.3212295770645142
test 0 0.0 2.7259035110473633
test 1 0.0 2.4790565967559814


Overall:  30%|▎| 122/400 [8:48:24<13:54:59, 180.21s/it, decoder_mask_ratio=0.75, epoch=121, lr=0.000243

{'train/loss': 3.506800852715969, 'train/recon_losses': nan, 'train/contrastive_losses': 3.506800852715969, 'train/num_steps': 3904, 'train/cos_sim_encoder_output': 0.9618254769593477, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9942989591509104, 'train/probe_losses': 1.79185156027476, 'train/probe_accs': 0.234375, 'test/probe_losses': 2.6024800539016724, 'test/probe_accs': 0.0, 'lr': 0.00024344321855496848, 'epoch': 121, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.28125 2.0290284156799316
1 0.1875 2.039663314819336
2 0.125 2.015840530395508
3 0.53125 1.755316138267517
4 0.4375 1.6807758808135986
5 0.375 1.5824297666549683
6 0.5 1.484317421913147
7 0.5625 1.512168288230896
8 0.6875 1.3862535953521729
9 0.625 1.3469280004501343
10 0.875 1.036818265914917
11 0.6875 1.4310804605484009
test 0 0.5 1.759941577911377
test 1 0.59375 1.6261396408081055


Overall:  31%|▎| 123/400 [8:51:04<13:24:38, 174.29s/it, decoder_mask_ratio=0.75, epoch=122, lr=0.000242

{'train/loss': 3.485084131360054, 'train/recon_losses': nan, 'train/contrastive_losses': 3.485084131360054, 'train/num_steps': 3936, 'train/cos_sim_encoder_output': 0.9455736763775349, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9928094316273928, 'train/probe_losses': 1.6083850065867107, 'train/probe_accs': 0.4895833333333333, 'test/probe_losses': 1.6930406093597412, 'test/probe_accs': 0.546875, 'lr': 0.0002424950187855686, 'epoch': 122, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.212050676345825
1 0.0 2.158480405807495
2 0.53125 1.9928600788116455
3 0.5 1.738246202468872
4 0.40625 1.700421929359436
5 0.125 2.0071144104003906
6 0.0625 2.185969114303589
7 0.125 2.3430333137512207
8 0.03125 2.034067392349243
9 0.21875 1.829155683517456
10 0.40625 1.533238410949707
11 0.375 1.8049039840698242
test 0 0.21875 2.12717342376709
test 1 0.21875 2.0929455757141113


Overall:  31%|▎| 124/400 [8:53:58<13:21:11, 174.17s/it, decoder_mask_ratio=0.75, epoch=123, lr=0.000242

{'train/loss': 3.461258389055729, 'train/recon_losses': nan, 'train/contrastive_losses': 3.461258389055729, 'train/num_steps': 3968, 'train/cos_sim_encoder_output': 0.9595713131129742, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9937496948987246, 'train/probe_losses': 1.9616284668445587, 'train/probe_accs': 0.23177083333333334, 'test/probe_losses': 2.1100594997406006, 'test/probe_accs': 0.21875, 'lr': 0.00024154081753628167, 'epoch': 123, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.6434483528137207
1 0.0 2.752025842666626
2 0.09375 2.3513996601104736
3 0.125 2.1225297451019287
4 0.46875 1.9433139562606812
5 0.75 1.5033584833145142
6 0.5625 1.5067648887634277
7 0.78125 1.1368701457977295
8 0.40625 1.5873810052871704
9 0.375 1.6631845235824585
10 0.40625 1.6033052206039429
11 0.21875 1.7614110708236694
test 0 0.25 2.153635025024414
test 1 0.125 2.415585517883301


Overall:  31%|▎| 125/400 [8:57:14<13:47:19, 180.51s/it, decoder_mask_ratio=0.75, epoch=124, lr=0.000241

{'train/loss': 3.465439185500145, 'train/recon_losses': nan, 'train/contrastive_losses': 3.465439185500145, 'train/num_steps': 4000, 'train/cos_sim_encoder_output': 0.9515623636543751, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9945892971009016, 'train/probe_losses': 1.8812494079271953, 'train/probe_accs': 0.3489583333333333, 'test/probe_losses': 2.2846102714538574, 'test/probe_accs': 0.1875, 'lr': 0.00024058067672385933, 'epoch': 124, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.0222856998443604
1 0.0 2.1399378776550293
2 0.09375 2.2154319286346436
3 0.28125 1.9849857091903687
4 0.5625 1.695001244544983
5 0.65625 1.3887757062911987
6 0.5625 1.4843828678131104
7 0.3125 1.7545839548110962
8 0.09375 1.9760291576385498
9 0.34375 1.490675926208496
10 0.125 1.7742254734039307
11 0.125 1.6580357551574707
test 0 0.28125 2.0116989612579346
test 1 0.28125 2.0621864795684814


Overall:  32%|▎| 126/400 [9:00:07<13:34:27, 178.35s/it, decoder_mask_ratio=0.75, epoch=125, lr=0.00024,

{'train/loss': 3.5680763497948647, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5680763497948647, 'train/num_steps': 4032, 'train/cos_sim_encoder_output': 0.9591175597161055, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9929262865334749, 'train/probe_losses': 1.798695941766103, 'train/probe_accs': 0.2630208333333333, 'test/probe_losses': 2.036942720413208, 'test/probe_accs': 0.28125, 'lr': 0.00023961465865046295, 'epoch': 125, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.3125 1.9268643856048584
1 0.34375 1.8153414726257324
2 0.25 1.7969967126846313
3 0.1875 1.7851759195327759
4 0.65625 1.5043096542358398
5 0.40625 1.6434484720230103
6 0.53125 1.760878562927246
7 0.15625 2.633638381958008
8 0.09375 2.877612590789795
9 0.1875 2.720806360244751
10 0.15625 2.7067244052886963
11 0.0625 2.5149455070495605
test 0 0.0 2.4894521236419678
test 1 0.0 2.5667340755462646


Overall:  32%|▎| 127/400 [9:02:52<13:13:04, 174.30s/it, decoder_mask_ratio=0.75, epoch=126, lr=0.000239

{'train/loss': 3.543552629649639, 'train/recon_losses': nan, 'train/contrastive_losses': 3.543552629649639, 'train/num_steps': 4064, 'train/cos_sim_encoder_output': 0.9624561425298452, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9919524621218443, 'train/probe_losses': 2.1405618687470755, 'train/probe_accs': 0.2786458333333333, 'test/probe_losses': 2.528093099594116, 'test/probe_accs': 0.0, 'lr': 0.00023864282599962092, 'epoch': 126, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.224301815032959
1 0.0 2.080064535140991
2 0.59375 1.8921290636062622
3 0.25 1.8661383390426636
4 0.125 1.9740492105484009
5 0.65625 1.6416735649108887
6 0.6875 1.2609376907348633
7 0.5625 1.4735552072525024
8 0.4375 1.7590829133987427
9 0.4375 1.8326411247253418
10 0.53125 1.5620827674865723
11 0.4375 1.5260215997695923
test 0 0.125 2.1770596504211426
test 1 0.0625 2.549248218536377


Overall:  32%|▎| 128/400 [9:05:28<12:45:39, 168.90s/it, decoder_mask_ratio=0.75, epoch=127, lr=0.000238

{'train/loss': 3.5020972937345505, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5020972937345505, 'train/num_steps': 4096, 'train/cos_sim_encoder_output': 0.9630665201693773, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9945941735059023, 'train/probe_losses': 1.7577231526374817, 'train/probe_accs': 0.3932291666666667, 'test/probe_losses': 2.3631539344787598, 'test/probe_accs': 0.09375, 'lr': 0.00023766524183216133, 'epoch': 127, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.25 2.0342018604278564
1 0.1875 2.0148069858551025
2 0.3125 1.9748342037200928
3 0.15625 1.8917018175125122
4 0.5 1.8434360027313232
5 0.28125 1.8849520683288574
6 0.125 2.245371103286743
7 0.25 1.7393497228622437
8 0.46875 1.6458793878555298
9 0.3125 2.012549877166748
10 0.34375 1.9217718839645386
11 0.21875 2.1517982482910156
test 0 0.265625 2.026670455932617
test 1 0.28125 2.013090133666992


Overall:  32%|▎| 129/400 [9:08:06<12:27:45, 165.55s/it, decoder_mask_ratio=0.75, epoch=128, lr=0.000237

{'train/loss': 3.432722419500351, 'train/recon_losses': nan, 'train/contrastive_losses': 3.432722419500351, 'train/num_steps': 4128, 'train/cos_sim_encoder_output': 0.9584946185350418, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9924963321536779, 'train/probe_losses': 1.9467210968335469, 'train/probe_accs': 0.2838541666666667, 'test/probe_losses': 2.0198802947998047, 'test/probe_accs': 0.2734375, 'lr': 0.0002366819695821198, 'epoch': 128, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0625 2.1301863193511963
1 0.0625 2.146362066268921
2 0.0 1.9709112644195557
3 0.5 1.9473739862442017
4 0.25 2.0825459957122803
5 0.34375 1.8197364807128906
6 0.25 1.8265292644500732
7 0.28125 1.6702427864074707
8 0.03125 1.564217448234558
9 0.4375 1.553915023803711
10 0.1875 1.7649625539779663
11 0.3125 1.862365484237671
test 0 0.09375 2.496572971343994
test 1 0.203125 2.3282487392425537


Overall:  32%|▎| 130/400 [9:10:43<12:14:12, 163.16s/it, decoder_mask_ratio=0.75, epoch=129, lr=0.000236

{'train/loss': 3.4191486164927483, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4191486164927483, 'train/num_steps': 4160, 'train/cos_sim_encoder_output': 0.954878618940711, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9903247393667698, 'train/probe_losses': 1.8616123894850414, 'train/probe_accs': 0.2265625, 'test/probe_losses': 2.412410855293274, 'test/probe_accs': 0.1484375, 'lr': 0.00023569307305262358, 'epoch': 129, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.46875 1.881252646446228
1 0.6875 1.815955638885498
2 0.53125 1.8062629699707031
3 0.34375 1.8962266445159912
4 0.46875 1.5695229768753052
5 0.46875 1.4260889291763306
6 0.34375 1.7960546016693115
7 0.1875 1.9592303037643433
8 0.0625 1.7448774576187134
9 0.25 1.8635988235473633
10 0.15625 1.9851411581039429
11 0.1875 1.859094262123108
test 0 0.21875 2.0686209201812744
test 1 0.375 2.066359519958496


Overall:  33%|▎| 131/400 [9:16:27<16:14:01, 217.26s/it, decoder_mask_ratio=0.75, epoch=130, lr=0.000235

{'train/loss': 3.371413841843605, 'train/recon_losses': nan, 'train/contrastive_losses': 3.371413841843605, 'train/num_steps': 4192, 'train/cos_sim_encoder_output': 0.957195470109582, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9926927667111158, 'train/probe_losses': 1.8002755343914032, 'train/probe_accs': 0.3463541666666667, 'test/probe_losses': 2.0674902200698853, 'test/probe_accs': 0.296875, 'lr': 0.00023469861641175138, 'epoch': 130, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.25 1.9918795824050903
1 0.15625 2.0426552295684814
2 0.15625 2.0213780403137207
3 0.34375 1.9652327299118042
4 0.53125 1.6278246641159058
5 0.1875 1.7551822662353516
6 0.296875 1.8638274669647217
7 0.28125 1.9522502422332764
8 0.15625 2.0761053562164307
9 0.125 2.1238555908203125
10 0.140625 2.1438100337982178
11 0.015625 1.8338027000427246
test 0 0.28125 1.8772958517074585
test 1 0.34375 2.0399906635284424


Overall:  33%|▎| 132/400 [9:19:24<15:17:04, 205.31s/it, decoder_mask_ratio=0.75, epoch=131, lr=0.000234

{'train/loss': 3.3441584706306458, 'train/recon_losses': nan, 'train/contrastive_losses': 3.3441584706306458, 'train/num_steps': 4224, 'train/cos_sim_encoder_output': 0.9470435678958893, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9945985153317451, 'train/probe_losses': 1.949816991885503, 'train/probe_accs': 0.22005208333333334, 'test/probe_losses': 1.9586432576179504, 'test/probe_accs': 0.3125, 'lr': 0.00023369866418836946, 'epoch': 131, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.151718854904175
1 0.0 2.058908700942993
2 0.21875 2.0739433765411377
3 0.25 2.0234060287475586
4 0.46875 1.655704379081726
5 0.6875 1.5341899394989014
6 0.5625 1.4755719900131226
7 0.375 1.561315655708313
8 0.28125 1.8690006732940674
9 0.4375 1.7845265865325928
10 0.40625 2.003350257873535
11 0.40625 1.6957799196243286
test 0 0.15625 2.3149073123931885
test 1 0.09375 2.4371180534362793


Overall:  33%|▎| 133/400 [9:22:37<14:56:34, 201.48s/it, decoder_mask_ratio=0.75, epoch=132, lr=0.000233

{'train/loss': 3.426267944276333, 'train/recon_losses': nan, 'train/contrastive_losses': 3.426267944276333, 'train/num_steps': 4256, 'train/cos_sim_encoder_output': 0.9596332460641861, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9915635883808136, 'train/probe_losses': 1.8239513635635376, 'train/probe_accs': 0.3411458333333333, 'test/probe_losses': 2.376012682914734, 'test/probe_accs': 0.125, 'lr': 0.0002326932812679446, 'epoch': 132, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.4735567569732666
1 0.0 2.4153640270233154
2 0.59375 2.1389319896698
3 0.65625 1.63010835647583
4 0.53125 1.5178767442703247
5 0.25 1.8986284732818604
6 0.40625 1.6612930297851562
7 0.25 2.039562463760376
8 0.125 2.224867582321167
9 0.21875 1.9493162631988525
10 0.03125 2.553997039794922
11 0.109375 2.1811118125915527
test 0 0.46875 1.85469388961792
test 1 0.34375 2.1266307830810547


Overall:  34%|▎| 134/400 [9:25:14<13:54:00, 188.12s/it, decoder_mask_ratio=0.75, epoch=133, lr=0.000232

{'train/loss': 3.4625374153256416, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4625374153256416, 'train/num_steps': 4288, 'train/cos_sim_encoder_output': 0.9641253426671028, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9927356019616127, 'train/probe_losses': 2.0570512115955353, 'train/probe_accs': 0.2643229166666667, 'test/probe_losses': 1.9906623363494873, 'test/probe_accs': 0.40625, 'lr': 0.00023168253288833378, 'epoch': 133, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.1441233158111572
1 0.0 2.098618984222412
2 0.3125 1.9762474298477173
3 0.4375 1.8894951343536377
4 0.375 1.7166998386383057
5 0.59375 1.4585416316986084
6 0.78125 1.3169978857040405
7 0.84375 1.201811671257019
8 0.4375 1.4554328918457031
9 0.53125 1.3142917156219482
10 0.53125 1.3394443988800049
11 0.46875 1.4933644533157349
test 0 0.25 1.6887311935424805
test 1 0.25 2.276890277862549


Overall:  34%|▎| 135/400 [9:28:41<14:15:56, 193.80s/it, decoder_mask_ratio=0.75, epoch=134, lr=0.000231

{'train/loss': 3.360833376646042, 'train/recon_losses': nan, 'train/contrastive_losses': 3.360833376646042, 'train/num_steps': 4320, 'train/cos_sim_encoder_output': 0.9622201193124056, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9907432403415442, 'train/probe_losses': 1.6170891125996907, 'train/probe_accs': 0.4427083333333333, 'test/probe_losses': 1.9828107357025146, 'test/probe_accs': 0.25, 'lr': 0.00023066648463555085, 'epoch': 134, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.0014266967773438
1 0.0625 2.042985439300537
2 0.0 2.137540102005005
3 0.5 1.5861330032348633
4 0.40625 1.5051932334899902
5 0.28125 1.5137624740600586
6 0.125 1.6517231464385986
7 0.5 1.4549566507339478
8 0.21875 1.823225975036621
9 0.28125 1.67653489112854
10 0.421875 1.8858057260513306
11 0.28125 2.20743727684021
test 0 0.5 1.9433997869491577
test 1 0.53125 1.9024436473846436


Overall:  34%|▎| 136/400 [9:31:48<14:04:05, 191.84s/it, decoder_mask_ratio=0.75, epoch=135, lr=0.00023,

{'train/loss': 3.4521273151040077, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4521273151040077, 'train/num_steps': 4352, 'train/cos_sim_encoder_output': 0.9643146600574255, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9891411978751421, 'train/probe_losses': 1.7905603845914204, 'train/probe_accs': 0.2565104166666667, 'test/probe_losses': 1.9229217171669006, 'test/probe_accs': 0.515625, 'lr': 0.0002296452024395109, 'epoch': 135, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.28125 2.1030209064483643
1 0.4375 1.9627357721328735
2 0.4375 1.7664446830749512
3 0.21875 1.82566237449646
4 0.15625 1.7111740112304688
5 0.40625 1.9894119501113892
6 0.25 2.0703725814819336
7 0.15625 2.2679035663604736
8 0.15625 2.4965929985046387
9 0.125 2.3348748683929443
10 0.3125 2.011383056640625
11 0.3125 1.8706244230270386
test 0 0.46875 1.5105564594268799
test 1 0.53125 1.4161421060562134


Overall:  34%|▎| 137/400 [9:35:09<14:13:09, 194.64s/it, decoder_mask_ratio=0.75, epoch=136, lr=0.000229

{'train/loss': 3.6369739174842834, 'train/recon_losses': nan, 'train/contrastive_losses': 3.6369739174842834, 'train/num_steps': 4384, 'train/cos_sim_encoder_output': 0.9709065835922956, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9930470939725637, 'train/probe_losses': 2.0341834326585135, 'train/probe_accs': 0.2708333333333333, 'test/probe_losses': 1.4633492827415466, 'test/probe_accs': 0.5, 'lr': 0.00022861875256975205, 'epoch': 136, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.03125 2.3883044719696045
1 0.15625 2.2624053955078125
2 0.03125 2.178379535675049
3 0.4375 1.9515132904052734
4 0.21875 2.126678943634033
5 0.265625 1.945925235748291
6 0.203125 1.9971200227737427
7 0.15625 2.161343812942505
8 0.09375 2.4448230266571045
9 0.296875 1.9916141033172607
10 0.1875 2.2164602279663086
11 0.125 2.297699451446533
test 0 0.28125 1.7639565467834473
test 1 0.375 1.7852579355239868


Overall:  34%|▎| 138/400 [9:37:46<13:19:54, 183.18s/it, decoder_mask_ratio=0.75, epoch=137, lr=0.000228

{'train/loss': 3.408852405846119, 'train/recon_losses': nan, 'train/contrastive_losses': 3.408852405846119, 'train/num_steps': 4416, 'train/cos_sim_encoder_output': 0.9566223938018084, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.99143947660923, 'train/probe_losses': 2.1635222931702933, 'train/probe_accs': 0.18359375, 'test/probe_losses': 1.774607241153717, 'test/probe_accs': 0.328125, 'lr': 0.00022758720163113547, 'epoch': 137, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.25 2.3399863243103027
1 0.15625 2.2965002059936523
2 0.0625 2.2694966793060303
3 0.1875 1.7804276943206787
4 0.1875 1.6903822422027588
5 0.8125 1.4427663087844849
6 0.40625 1.865465521812439
7 0.25 2.2005269527435303
8 0.21875 2.0716874599456787
9 0.21875 1.9956597089767456
10 0.09375 2.0560171604156494
11 0.125 1.8288729190826416
test 0 0.4375 1.3666311502456665
test 1 0.4375 1.389257788658142


Overall:  35%|▎| 139/400 [9:41:06<13:39:46, 188.45s/it, decoder_mask_ratio=0.75, epoch=138, lr=0.000227

{'train/loss': 3.436442092061043, 'train/recon_losses': nan, 'train/contrastive_losses': 3.436442092061043, 'train/num_steps': 4448, 'train/cos_sim_encoder_output': 0.9560029115527868, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9918579049408436, 'train/probe_losses': 1.986482431491216, 'train/probe_accs': 0.24739583333333334, 'test/probe_losses': 1.3779444694519043, 'test/probe_accs': 0.4375, 'lr': 0.00022655061655952317, 'epoch': 138, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.6158454418182373
1 0.0 2.5216259956359863
2 0.0 2.5352509021759033
3 0.40625 2.0319571495056152
4 0.28125 1.8851639032363892
5 0.71875 1.5843268632888794
6 0.25 2.0653061866760254
7 0.28125 1.9538275003433228
8 0.1875 2.0859479904174805
9 0.125 2.288799285888672
10 0.125 2.2054154872894287
11 0.125 2.2238922119140625
test 0 0.21875 2.0977935791015625


Overall:  35%|▎| 139/400 [9:44:03<13:39:46, 188.45s/it, decoder_mask_ratio=0.75, epoch=139, lr=0.000226

test 1 0.125 2.250018358230591
{'train/loss': 3.5755914226174355, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5755914226174355, 'train/num_steps': 4480, 'train/cos_sim_encoder_output': 0.9614335503429174, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9957108106464148, 'train/probe_losses': 2.1664465765158334, 'train/probe_accs': 0.20833333333333334, 'test/probe_losses': 2.1739059686660767, 'test/probe_accs': 0.171875, 'lr': 0.00022550906461743506, 'epoch': 139, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}


Overall:  35%|▎| 140/400 [9:44:03<13:21:13, 184.90s/it, decoder_mask_ratio=0.75, epoch=139, lr=0.000226

0 0.0 2.3207178115844727
1 0.0 2.23828387260437
2 0.09375 2.146379232406616
3 0.65625 1.7958831787109375
4 0.53125 1.606729507446289
5 0.5 1.5194380283355713
6 0.578125 1.3348095417022705
7 0.4375 1.4345076084136963
8 0.125 1.7013849020004272
9 0.15625 1.6579500436782837
10 0.4375 1.678375482559204
11 0.515625 1.6574366092681885
test 0 0.15625 1.9722161293029785
test 1 0.3125 1.6898589134216309


Overall:  35%|▎| 141/400 [9:47:12<13:23:54, 186.23s/it, decoder_mask_ratio=0.75, epoch=140, lr=0.000224

{'train/loss': 3.453694872558117, 'train/recon_losses': nan, 'train/contrastive_losses': 3.453694872558117, 'train/num_steps': 4512, 'train/cos_sim_encoder_output': 0.9537229780107737, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.991924200206995, 'train/probe_losses': 1.7576579848925273, 'train/probe_accs': 0.3359375, 'test/probe_losses': 1.8310375213623047, 'test/probe_accs': 0.234375, 'lr': 0.00022446261338968405, 'epoch': 140, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.0367541313171387
1 0.09375 1.972287654876709
2 0.3125 1.953107476234436
3 0.25 1.753540277481079
4 0.625 1.4324495792388916
5 0.4375 1.6886261701583862
6 0.53125 1.6159563064575195
7 0.71875 1.259806752204895
8 0.78125 1.1444272994995117
9 0.5 1.9726707935333252
10 0.46875 2.022822856903076
11 0.40625 1.9221938848495483
test 0 0.3125 2.692013740539551
test 1 0.3125 2.611222267150879


Overall:  36%|▎| 142/400 [9:49:44<12:36:17, 175.88s/it, decoder_mask_ratio=0.75, epoch=141, lr=0.000223

{'train/loss': 3.5566689297556877, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5566689297556877, 'train/num_steps': 4544, 'train/cos_sim_encoder_output': 0.9643433075398207, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9955265149474144, 'train/probe_losses': 1.731220265229543, 'train/probe_accs': 0.4270833333333333, 'test/probe_losses': 2.651618003845215, 'test/probe_accs': 0.3125, 'lr': 0.00022341133077899062, 'epoch': 141, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.40625 2.12218976020813
1 0.6875 1.9565640687942505
2 0.46875 1.8601303100585938
3 0.28125 1.7673516273498535
4 0.21875 1.8829848766326904
5 0.25 1.7397462129592896
6 0.421875 1.8393884897232056
7 0.25 1.9188505411148071
8 0.28125 2.0866165161132812
9 0.03125 2.590585708618164
10 0.09375 2.5499918460845947
11 0.0625 2.41178297996521
test 0 0.3125 2.2316572666168213
test 1 0.25 2.121316432952881


Overall:  36%|▎| 143/400 [9:53:06<13:06:24, 183.60s/it, decoder_mask_ratio=0.75, epoch=142, lr=0.000222

{'train/loss': 3.6795705556869507, 'train/recon_losses': nan, 'train/contrastive_losses': 3.6795705556869507, 'train/num_steps': 4576, 'train/cos_sim_encoder_output': 0.94720877148211, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9951126389205456, 'train/probe_losses': 2.0605152448018393, 'train/probe_accs': 0.2877604166666667, 'test/probe_losses': 2.176486849784851, 'test/probe_accs': 0.28125, 'lr': 0.0002223552850015769, 'epoch': 142, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 1.9292054176330566
1 0.0 1.9430360794067383
2 0.1875 1.7755630016326904
3 0.4375 1.566989779472351
4 0.28125 1.6649056673049927
5 0.21875 1.9399162530899048
6 0.125 2.0638229846954346
7 0.25 1.893507957458496
8 0.3125 1.623886227607727
9 0.625 1.2602336406707764
10 0.5625 1.6221628189086914
11 0.53125 2.0712740421295166
test 0 0.125 2.1594762802124023
test 1 0.09375 2.238398551940918


Overall:  36%|▎| 144/400 [9:56:17<13:13:25, 185.96s/it, decoder_mask_ratio=0.75, epoch=143, lr=0.000221

{'train/loss': 3.527875155210495, 'train/recon_losses': nan, 'train/contrastive_losses': 3.527875155210495, 'train/num_steps': 4608, 'train/cos_sim_encoder_output': 0.9568948578089476, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9944921936839819, 'train/probe_losses': 1.7795419891675313, 'train/probe_accs': 0.2942708333333333, 'test/probe_losses': 2.19893741607666, 'test/probe_accs': 0.109375, 'lr': 0.00022129454458274, 'epoch': 143, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.125 2.2104573249816895
1 0.28125 2.012181282043457
2 0.5625 1.9805643558502197
3 0.6875 1.6200331449508667
4 0.59375 1.371658444404602
5 0.53125 1.1700739860534668
6 0.4375 1.229948878288269
7 0.59375 1.2397675514221191
8 0.3125 1.6235319375991821
9 0.28125 1.6702373027801514
10 0.375 1.4320050477981567
11 0.5625 1.1734659671783447
test 0 0.46875 2.292691707611084
test 1 0.25 2.421520948410034


Overall:  36%|▎| 145/400 [9:59:07<12:49:49, 181.13s/it, decoder_mask_ratio=0.75, epoch=144, lr=0.00022,

{'train/loss': 3.5601617246866226, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5601617246866226, 'train/num_steps': 4640, 'train/cos_sim_encoder_output': 0.9580676890909672, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9905644953250885, 'train/probe_losses': 1.5611604352792103, 'train/probe_accs': 0.4453125, 'test/probe_losses': 2.357106328010559, 'test/probe_accs': 0.359375, 'lr': 0.0002202291783524056, 'epoch': 144, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0625 2.1594390869140625
1 0.0625 2.15659236907959
2 0.03125 1.9486379623413086
3 0.71875 1.540400743484497
4 0.625 1.3858221769332886
5 0.4375 1.5658743381500244
6 0.28125 1.8862643241882324
7 0.3125 1.7296918630599976
8 0.1875 2.0998497009277344
9 0.15625 1.9611879587173462
10 0.1875 1.677514672279358
11 0.15625 1.6401631832122803
test 0 0.0 2.902461290359497
test 1 0.21875 2.558178424835205


Overall:  36%|▎| 146/400 [10:02:00<12:35:59, 178.58s/it, decoder_mask_ratio=0.75, epoch=145, lr=0.00021

{'train/loss': 3.5642743706703186, 'train/recon_losses': nan, 'train/contrastive_losses': 3.5642743706703186, 'train/num_steps': 4672, 'train/cos_sim_encoder_output': 0.9468261804431677, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9940595924854279, 'train/probe_losses': 1.8126198649406433, 'train/probe_accs': 0.2682291666666667, 'test/probe_losses': 2.730319857597351, 'test/probe_accs': 0.109375, 'lr': 0.00021915925544066164, 'epoch': 145, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.1875 2.0472934246063232
1 0.1875 2.064802885055542
2 0.21875 1.8956083059310913
3 0.71875 1.7173830270767212
4 0.46875 1.5961711406707764
5 0.90625 1.3283737897872925
6 0.65625 1.5171598196029663
7 0.359375 1.8349419832229614
8 0.8125 1.04160475730896
9 0.4375 1.6488678455352783
10 0.5 1.5122328996658325
11 0.4375 1.4892178773880005
test 0 0.125 2.3372254371643066


Overall:  36%|▎| 146/400 [10:05:06<12:35:59, 178.58s/it, decoder_mask_ratio=0.75, epoch=146, lr=0.00021

test 1 0.09375 2.1656219959259033
{'train/loss': 3.341368429362774, 'train/recon_losses': nan, 'train/contrastive_losses': 3.341368429362774, 'train/num_steps': 4704, 'train/cos_sim_encoder_output': 0.9436250980943441, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9928880129009485, 'train/probe_losses': 1.6411381463209789, 'train/probe_accs': 0.4908854166666667, 'test/probe_losses': 2.251423716545105, 'test/probe_accs': 0.109375, 'lr': 0.0002180848452732726, 'epoch': 146, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}


Overall:  37%|▎| 147/400 [10:05:06<12:43:21, 181.03s/it, decoder_mask_ratio=0.75, epoch=146, lr=0.00021

0 0.015625 2.3013205528259277
1 0.0625 2.13250470161438
2 0.25 1.9894956350326538
3 0.125 1.8370803594589233
4 0.71875 1.6804933547973633
5 0.125 1.78904390335083
6 0.3125 1.5201449394226074
7 0.3125 1.5107781887054443
8 0.34375 1.542107343673706
9 0.4375 2.0412397384643555
10 0.25 2.1682276725769043
11 0.1875 2.3555052280426025
test 0 0.0625 2.216663122177124


Overall:  37%|▎| 147/400 [10:07:54<12:43:21, 181.03s/it, decoder_mask_ratio=0.75, epoch=147, lr=0.00021

test 1 0.21875 2.0435564517974854
{'train/loss': 3.4712703600525856, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4712703600525856, 'train/num_steps': 4736, 'train/cos_sim_encoder_output': 0.9455128442496061, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9930119961500168, 'train/probe_losses': 1.9056618014971416, 'train/probe_accs': 0.26171875, 'test/probe_losses': 2.1301097869873047, 'test/probe_accs': 0.140625, 'lr': 0.00021700601756717455, 'epoch': 147, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}


Overall:  37%|▎| 148/400 [10:07:54<12:23:22, 176.99s/it, decoder_mask_ratio=0.75, epoch=147, lr=0.00021

0 0.1875 2.021967649459839
1 0.21875 2.0089027881622314
2 0.1875 1.9100011587142944
3 0.34375 1.9114004373550415
4 0.15625 2.139225482940674
5 0.28125 1.9842803478240967
6 0.71875 1.63773512840271
7 0.40625 1.5524283647537231
8 0.15625 1.6993190050125122
9 0.125 1.6428351402282715
10 0.0625 1.5878404378890991
11 0.09375 1.7203245162963867
test 0 0.25 1.600616693496704


Overall:  37%|▎| 148/400 [10:10:30<12:23:22, 176.99s/it, decoder_mask_ratio=0.75, epoch=148, lr=0.00021

test 1 0.25 1.5661929845809937
{'train/loss': 3.3849248066544533, 'train/recon_losses': nan, 'train/contrastive_losses': 3.3849248066544533, 'train/num_steps': 4768, 'train/cos_sim_encoder_output': 0.9430176559835672, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9927121140062809, 'train/probe_losses': 1.8180217047532399, 'train/probe_accs': 0.24479166666666666, 'test/probe_losses': 1.5834048390388489, 'test/probe_accs': 0.25, 'lr': 0.00021592284232595135, 'epoch': 148, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}


Overall:  37%|▎| 149/400 [10:10:30<11:53:49, 170.64s/it, decoder_mask_ratio=0.75, epoch=148, lr=0.00021

0 0.0 2.159386157989502
1 0.25 2.037400722503662
2 0.15625 2.1077630519866943
3 0.4375 2.036036491394043
4 0.40625 1.656856894493103
5 0.25 1.7107412815093994
6 0.3125 1.8228318691253662
7 0.46875 1.5792685747146606
8 0.53125 1.353935956954956
9 0.625 1.599847674369812
10 0.625 1.4926410913467407
11 0.46875 1.878291368484497
test 0 0.78125 0.9268429279327393
test 1 0.5625 1.0537278652191162


Overall:  38%|▍| 150/400 [10:13:41<12:16:09, 176.68s/it, decoder_mask_ratio=0.75, epoch=149, lr=0.00021

{'train/loss': 3.3833518624305725, 'train/recon_losses': nan, 'train/contrastive_losses': 3.3833518624305725, 'train/num_steps': 4800, 'train/cos_sim_encoder_output': 0.9298196993768215, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9928327742964029, 'train/probe_losses': 1.7862500945727031, 'train/probe_accs': 0.3776041666666667, 'test/probe_losses': 0.9902853965759277, 'test/probe_accs': 0.671875, 'lr': 0.00021483538983529214, 'epoch': 149, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.109375 2.077549934387207
1 0.34375 2.2033989429473877
2 0.34375 2.0369017124176025
3 0.3125 1.8252865076065063
4 0.1875 1.867558479309082
5 0.375 2.0416204929351807
6 0.15625 2.462765693664551
7 0.0625 2.463118076324463
8 0.09375 2.4848923683166504
9 0.0625 2.436509847640991
10 0.03125 2.7713983058929443
11 0.03125 2.5963149070739746
test 0 0.0 2.3970694541931152
test 1 0.0 2.289167881011963


Overall:  38%|▍| 151/400 [10:16:35<12:09:56, 175.89s/it, decoder_mask_ratio=0.75, epoch=150, lr=0.00021

{'train/loss': 3.444119483232498, 'train/recon_losses': nan, 'train/contrastive_losses': 3.444119483232498, 'train/num_steps': 4832, 'train/cos_sim_encoder_output': 0.9491137024015188, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.988353781402111, 'train/probe_losses': 2.2722762723763785, 'train/probe_accs': 0.17578125, 'test/probe_losses': 2.343118667602539, 'test/probe_accs': 0.0, 'lr': 0.00021374373065843064, 'epoch': 150, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.34375 2.0453951358795166
1 0.53125 1.884246587753296
2 0.28125 2.0305018424987793
3 0.25 1.8959139585494995
4 0.21875 1.7321423292160034
5 0.34375 1.4230153560638428
6 0.59375 1.3917977809906006
7 0.53125 1.2883834838867188
8 0.40625 1.2925689220428467
9 0.25 1.4868491888046265
10 0.296875 1.2107014656066895
11 0.3125 1.2404966354370117
test 0 0.0625 2.20743727684021
test 1 0.21875 1.6644933223724365


Overall:  38%|▍| 152/400 [10:19:53<12:34:55, 182.64s/it, decoder_mask_ratio=0.75, epoch=151, lr=0.00021

{'train/loss': 3.452360011637211, 'train/recon_losses': nan, 'train/contrastive_losses': 3.452360011637211, 'train/num_steps': 4864, 'train/cos_sim_encoder_output': 0.9499202035367489, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9946253318339586, 'train/probe_losses': 1.576834390560786, 'train/probe_accs': 0.36328125, 'test/probe_losses': 1.9359652996063232, 'test/probe_accs': 0.140625, 'lr': 0.00021264793563156647, 'epoch': 151, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.3125 1.8376929759979248
1 0.21875 1.91834557056427
2 0.09375 1.8683286905288696
3 0.5 1.8692718744277954
4 0.5 1.7425167560577393
5 0.25 1.8328627347946167
6 0.3125 1.8740239143371582
7 0.125 2.192626476287842
8 0.15625 1.895354151725769
9 0.15625 2.2788829803466797
10 0.25 1.866962194442749
11 0.296875 1.9560916423797607
test 0 0.5 1.455859899520874
test 1 0.3125 1.6721705198287964


Overall:  38%|▍| 153/400 [10:22:43<12:15:48, 178.74s/it, decoder_mask_ratio=0.75, epoch=152, lr=0.00021

{'train/loss': 3.4362496361136436, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4362496361136436, 'train/num_steps': 4896, 'train/cos_sim_encoder_output': 0.9413013774901628, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9924666453152895, 'train/probe_losses': 1.9277466634909313, 'train/probe_accs': 0.2643229166666667, 'test/probe_losses': 1.5640152096748352, 'test/probe_accs': 0.40625, 'lr': 0.00021154807585926856, 'epoch': 152, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.4128258228302
1 0.0 2.2852721214294434
2 0.0 2.1793437004089355
3 0.59375 1.8517301082611084
4 0.34375 1.726970911026001
5 0.34375 1.764264464378357
6 0.46875 1.6802401542663574
7 0.21875 1.8893024921417236
8 0.40625 1.493200421333313
9 0.65625 1.5154025554656982
10 0.3125 2.134641170501709
11 0.28125 2.1964032649993896
test 0 0.46875 1.888759732246399
test 1 0.40625 1.8663195371627808


Overall:  38%|▍| 154/400 [10:26:12<12:50:07, 187.84s/it, decoder_mask_ratio=0.75, epoch=153, lr=0.00021

{'train/loss': 3.3703833743929863, 'train/recon_losses': nan, 'train/contrastive_losses': 3.3703833743929863, 'train/num_steps': 4928, 'train/cos_sim_encoder_output': 0.9460920840501785, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9881240073591471, 'train/probe_losses': 1.9274664322535198, 'train/probe_accs': 0.3020833333333333, 'test/probe_losses': 1.8775396347045898, 'test/probe_accs': 0.4375, 'lr': 0.0002104442227098613, 'epoch': 153, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.125 1.8568706512451172
1 0.21875 1.8139561414718628
2 0.40625 1.8207587003707886
3 0.21875 1.955634593963623
4 0.46875 1.686602234840393
5 0.53125 1.4632667303085327
6 0.71875 1.464805006980896
7 0.6875 1.7421345710754395
8 0.8125 1.3901258707046509
9 0.46875 2.1054487228393555
10 0.78125 1.418648362159729
11 0.78125 1.4701149463653564
test 0 0.0625 2.635953664779663
test 1 0.0 2.507460594177246


Overall:  39%|▍| 155/400 [10:29:10<12:35:05, 184.92s/it, decoder_mask_ratio=0.75, epoch=154, lr=0.00020

{'train/loss': 3.4254142493009567, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4254142493009567, 'train/num_steps': 4960, 'train/cos_sim_encoder_output': 0.9433202650398016, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.99085833132267, 'train/probe_losses': 1.6823638776938121, 'train/probe_accs': 0.5182291666666666, 'test/probe_losses': 2.5717071294784546, 'test/probe_accs': 0.03125, 'lr': 0.00020933644781079383, 'epoch': 154, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.03125 2.335935592651367
1 0.15625 2.1909704208374023
2 0.21875 2.0460257530212402
3 0.0625 2.0768322944641113
4 0.1875 1.9169683456420898
5 0.34375 1.8856674432754517
6 0.21875 1.7485344409942627
7 0.21875 1.8037400245666504
8 0.15625 1.8921881914138794
9 0.15625 1.8971010446548462
10 0.3125 1.7633696794509888
11 0.40625 1.688396692276001
test 0 0.34375 1.9827260971069336
test 1 0.28125 1.850521445274353


Overall:  39%|▍| 156/400 [10:31:56<12:09:18, 179.34s/it, decoder_mask_ratio=0.75, epoch=155, lr=0.00020

{'train/loss': 3.333754986524582, 'train/recon_losses': nan, 'train/contrastive_losses': 3.333754986524582, 'train/num_steps': 4992, 'train/cos_sim_encoder_output': 0.9308192562311888, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9866695199161768, 'train/probe_losses': 1.937144160270691, 'train/probe_accs': 0.20572916666666666, 'test/probe_losses': 1.9166237711906433, 'test/probe_accs': 0.3125, 'lr': 0.0002082248230439917, 'epoch': 155, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.3125 1.9148857593536377
1 0.15625 2.0909464359283447
2 0.5 1.9236921072006226
3 0.40625 1.8693162202835083
4 0.40625 1.8143682479858398
5 0.34375 1.7462586164474487
6 0.390625 1.8218835592269897
7 0.46875 1.8146735429763794
8 0.5625 1.7152185440063477
9 0.53125 1.782792329788208
10 0.40625 1.851741909980774
11 0.4375 1.7451975345611572
test 0 0.09375 2.2520663738250732
test 1 0.09375 1.9707399606704712


Overall:  39%|▍| 157/400 [10:35:06<12:19:03, 182.48s/it, decoder_mask_ratio=0.75, epoch=156, lr=0.00020

{'train/loss': 3.2920874655246735, 'train/recon_losses': nan, 'train/contrastive_losses': 3.2920874655246735, 'train/num_steps': 5024, 'train/cos_sim_encoder_output': 0.9320421852171421, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9927645567804575, 'train/probe_losses': 1.8409145673116047, 'train/probe_accs': 0.41015625, 'test/probe_losses': 2.111403167247772, 'test/probe_accs': 0.09375, 'lr': 0.0002071094205411931, 'epoch': 156, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 1.8127758502960205
1 0.0 1.8041911125183105
2 0.21875 1.6818362474441528
3 0.4375 1.5338114500045776
4 0.5 1.2824711799621582
5 0.53125 1.1071709394454956
6 0.25 1.9031177759170532
7 0.09375 2.295794725418091
8 0.28125 1.9919114112854004
9 0.21875 2.0767629146575928
10 0.21875 2.1937477588653564
11 0.375 1.8659111261367798
test 0 0.59375 1.473860740661621
test 1 0.59375 1.4997267723083496


Overall:  40%|▍| 158/400 [10:38:30<12:42:00, 188.93s/it, decoder_mask_ratio=0.75, epoch=157, lr=0.00020

{'train/loss': 3.4160773754119873, 'train/recon_losses': nan, 'train/contrastive_losses': 3.4160773754119873, 'train/num_steps': 5056, 'train/cos_sim_encoder_output': 0.9308901336044073, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9918597098439932, 'train/probe_losses': 1.7957918743292491, 'train/probe_accs': 0.2604166666666667, 'test/probe_losses': 1.4867937564849854, 'test/probe_accs': 0.59375, 'lr': 0.00020599031267926797, 'epoch': 157, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.269829511642456
1 0.0 2.2550277709960938
2 0.0 2.015758514404297
3 0.71875 1.7560291290283203
4 0.5625 1.7457854747772217
5 0.5 1.691430926322937
6 0.6875 1.3331081867218018
7 0.53125 1.417587161064148
8 0.5625 1.4594178199768066
9 0.59375 1.3802258968353271
10 0.5 1.528820276260376
11 0.5625 1.5241838693618774
test 0 0.15625 2.4127209186553955
test 1 0.125 2.3109467029571533


Overall:  40%|▍| 159/400 [10:42:52<14:06:54, 210.85s/it, decoder_mask_ratio=0.75, epoch=158, lr=0.00020

{'train/loss': 3.390756033360958, 'train/recon_losses': nan, 'train/contrastive_losses': 3.390756033360958, 'train/num_steps': 5088, 'train/cos_sim_encoder_output': 0.9431071057915688, 'train/cos_sim_decoder_output': nan, 'train/cos_sim_encoder_output_patchwise': 0.9917267020791769, 'train/probe_losses': 1.6981003781159718, 'train/probe_accs': 0.4348958333333333, 'test/probe_losses': 2.3618338108062744, 'test/probe_accs': 0.140625, 'lr': 0.00020486757207552174, 'epoch': 158, 'tube_mask_ratio': 0.75, 'decoder_mask_ratio': 0.75}
0 0.0 2.2317755222320557
1 0.0 2.1777665615081787
2 0.15625 1.9903466701507568
