In [82]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
from transformers import GPT2Tokenizer, BatchEncoding, GPT2LMHeadModel, GPT2Config, GPT2ForSequenceClassification
from tqdm import tqdm
import torch as torch
import os
from torch.utils.data import Dataset
import wandb

# Shared GPT-2 tokenizer. GPT-2 defines no pad token, so the EOS token is
# reused as the pad token to make batch padding possible.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
# Root directory for the pickled datasets and the saved checkpoints.
path = "/om2/user/jackking/modular_transformers/scripts/dimensionality_reduction"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# SECURITY: the API key must never live in the notebook source. It is read from
# the WANDB_API_KEY environment variable; when that is unset, key=None lets
# wandb fall back to its own stored credentials / interactive prompt.
# NOTE(review): the key that was previously committed here should be revoked.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

#set seed
seed = 38
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjack-g-king[0m ([33mmodular_transformers[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/jackking/.netrc


In [83]:
class LMDataset(Dataset):
    """Map-style dataset that yields dicts of model inputs.

    Each item always contains 'input_ids'; 'attention_mask' and 'labels' are
    included only when the corresponding data was supplied.

    Args:
        inputs: token ids, as a tensor or anything `torch.tensor` accepts.
        attn_mask: optional attention mask, indexed in parallel with `inputs`.
        labels: optional labels, indexed in parallel with `inputs`.
    """

    def __init__(self, inputs, attn_mask=None, labels=None):
        #cast to tensors if not already tensors
        if not torch.is_tensor(inputs):
            inputs = torch.tensor(inputs)

        self.inputs = inputs
        self.attn_mask = self._optional_tensor(attn_mask)
        # labels defaults to None, so it must be guarded the same way as
        # attn_mask: torch.tensor(None) raises.
        self.labels = self._optional_tensor(labels)

    @staticmethod
    def _optional_tensor(values):
        """Return `values` as a tensor, passing None and tensors through unchanged."""
        if values is None or torch.is_tensor(values):
            return values
        return torch.tensor(values)

    def __len__(self):
        """Number of examples (length of `inputs` along its first dimension)."""
        return len(self.inputs)

    def __getitem__(self, idx):
        """Return the example at `idx` as a dict keyed by model argument name."""
        # Key order matches the original: input_ids, attention_mask, labels.
        item = {'input_ids': self.inputs[idx]}
        if self.attn_mask is not None:
            item['attention_mask'] = self.attn_mask[idx]
        if self.labels is not None:
            item['labels'] = self.labels[idx]
        return item

def make_autoregressive_dataset(data):
    """Tokenize `data` into a padded language-modelling dataset.

    The batch is padded to its longest sequence and the labels are a copy of
    the input ids.

    Args:
        data: batch of texts handed to `tokenizer.batch_encode_plus`.

    Returns:
        (LMDataset, context_len), where context_len is the padded sequence
        length (size of dimension 1 of the input ids).
    """
    # Padding needs a pad token; the EOS token is used for that.
    tokenizer.pad_token = tokenizer.eos_token
    encoded = tokenizer.batch_encode_plus(
        data,
        add_special_tokens=True,
        padding='longest',
        return_tensors="pt",
    )
    token_ids = encoded["input_ids"]
    mask = encoded["attention_mask"]
    # NOTE(review): the labels keep the padded positions as-is (no ignore index
    # is applied) — confirm that computing the loss on padding is intended.
    targets = token_ids.clone()
    return LMDataset(token_ids, mask, targets), token_ids.size(1)

def make_classification_dataset(data1, data2):
    """Tokenize two groups of texts into one binary-classification dataset.

    Items from `data1` get label 0 and items from `data2` get label 1; the two
    groups are concatenated in that order and padded to the longest sequence.

    Args:
        data1: texts for class 0.
        data2: texts for class 1.

    Returns:
        (LMDataset, context_len), where context_len is the padded sequence
        length (size of dimension 1 of the input ids).
    """
    # Padding needs a pad token; the EOS token is used for that.
    tokenizer.pad_token = tokenizer.eos_token
    n_first, n_second = len(data1), len(data2)
    all_texts = data1 + data2
    class_ids = [0] * n_first + [1] * n_second
    encoded = tokenizer.batch_encode_plus(
        all_texts,
        add_special_tokens=True,
        padding='longest',
        return_tensors="pt",
    )
    token_ids = encoded["input_ids"]
    mask = encoded["attention_mask"]
    return LMDataset(token_ids, mask, torch.tensor(class_ids)), token_ids.size(1)

In [84]:
def load_data(datatype, sub_datatype, classification, batch_size):
    """Load the pickled train/validation splits and wrap them in DataLoaders.

    Files are read from `{path}/data/{datatype}/`. For the "natural"
    sub-datatype the unpickled object is used directly as both inputs and
    labels; otherwise it is indexed by "inputs" (and by "labels" when
    `classification` is true).

    Args:
        datatype: dataset family; selects the data sub-directory.
        sub_datatype: dataset variant; selects the pickle file.
        classification: if true, use the stored "labels" as targets instead of
            the inputs themselves (ignored for the "natural" sub-datatype).
        batch_size: batch size for both loaders.

    Returns:
        (trainloader, valloader); only the training loader shuffles.
    """
    split_files = (
        f"{path}/data/{datatype}/train_data_{sub_datatype}.pkl",
        f"{path}/data/{datatype}/valid_data_{sub_datatype}.pkl",
    )
    # NOTE: pickle.load executes arbitrary code; only load trusted files.
    unpickled = []
    for file_name in split_files:
        with open(file_name, "rb") as handle:
            unpickled.append(pickle.load(handle))
    train_raw, val_raw = unpickled

    if sub_datatype == "natural":
        # The raw object serves as both the inputs and the labels.
        train_inputs, val_inputs = train_raw, val_raw
        train_targets, val_targets = train_raw, val_raw
    else:
        target_key = "labels" if classification else "inputs"
        train_targets = train_raw[target_key]
        val_targets = val_raw[target_key]
        train_inputs = train_raw["inputs"]
        val_inputs = val_raw["inputs"]

    # No attention mask is passed, so batches carry only input_ids and labels.
    train_set = LMDataset(train_inputs, labels=train_targets)
    val_set = LMDataset(val_inputs, labels=val_targets)
    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size, shuffle=False)
    return train_loader, val_loader

In [85]:
def load_model(embedding_dim, n_layer, n_head, resid_pdrop, embd_pdrop, attn_pdrop, classification, num_labels = None):
    """Build a randomly initialised GPT-2 model from the given hyper-parameters.

    Args:
        embedding_dim: hidden size (passed as `n_embd`).
        n_layer: number of transformer blocks.
        n_head: number of attention heads.
        resid_pdrop: residual dropout probability.
        embd_pdrop: embedding dropout probability.
        attn_pdrop: attention dropout probability.
        classification: if true build `GPT2ForSequenceClassification`,
            otherwise `GPT2LMHeadModel`.
        num_labels: number of classes; when None the config's own default
            is used.

    Returns:
        The model, with `config.pad_token_id` set to the EOS token id.
    """
    config_kwargs = dict(
        n_layer=n_layer,
        n_head=n_head,
        n_embd=embedding_dim,
        n_positions=128,
        resid_pdrop=resid_pdrop,
        embd_pdrop=embd_pdrop,
        attn_pdrop=attn_pdrop,
    )
    # Forward num_labels only when it was given: passing None explicitly would
    # override the config's default with None instead of selecting it.
    if num_labels is not None:
        config_kwargs["num_labels"] = num_labels
    model_config = GPT2Config(**config_kwargs)

    model_cls = GPT2ForSequenceClassification if classification else GPT2LMHeadModel
    # NOTE(review): `_from_config` is a private transformers API; kept as-is to
    # preserve the current initialisation behaviour.
    model = model_cls._from_config(model_config)

    # The pad token id is set to the EOS token id.
    model.config.pad_token_id = model.config.eos_token_id

    return model

In [86]:
def evaluate(model, valloader):
    """Return the mean of the per-batch losses of `model` over `valloader`.

    Puts the model in eval mode (it is not switched back here) and runs every
    batch without gradient tracking. Each batch counts equally in the mean,
    whatever its size.

    Args:
        model: model called as `model(input_ids, labels=..., attention_mask=...)`
            whose output exposes `.loss`.
        valloader: iterable of dict batches with "input_ids", "labels" and an
            optional "attention_mask".

    Returns:
        The averaged loss as a Python float.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for _, batch in tqdm(enumerate(valloader), total=len(valloader)):
            input_ids = batch["input_ids"].to(device)
            # The mask is optional; the model receives None when it is absent.
            mask = batch["attention_mask"].to(device) if "attention_mask" in batch else None
            targets = batch["labels"].to(device)
            result = model(input_ids, labels=targets, attention_mask=mask)
            batch_losses.append(result.loss)
    return torch.stack(batch_losses).mean().item()

In [88]:
def train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader):
    """Train `model`, logging to Weights & Biases and checkpointing every epoch.

    Args:
        model: model called as `model(input_ids, labels=..., attention_mask=...)`
            whose output exposes `.loss`.
        optimizer: optimizer over the model's parameters.
        lr_scheduler: scheduler stepped once per optimisation step, or None
            for a constant learning rate.
        train_config: dict logged as the W&B run config; "num_epochs" is read
            from it.
        model_name: sub-directory under `{path}/models/` for the checkpoints.
        trainloader: iterable of dict batches with "input_ids", "labels" and
            an optional "attention_mask".
        valloader: validation batches, passed to `evaluate` after each epoch.

    Side effects:
        Starts and finishes a W&B run, and writes checkpoints to
        `{path}/models/{model_name}/{run_name}/epoch_{n}` plus a
        `final_chkpoint` directory after the last epoch.
    """
    wandb.init(project="dimensionality reduction", config=train_config)
    run_name = wandb.run.name

    save_path = f"{path}/models/{model_name}/{run_name}"
    # Checkpoint after every epoch. (The original also computed
    # num_epochs // 10 here, but immediately overwrote it with 1.)
    save_epochs = 1

    try:
        for epoch in tqdm(range(train_config["num_epochs"])):
            model.train()
            torch.cuda.empty_cache()
            for step, batch in tqdm(enumerate(trainloader), total=len(trainloader)):
                optimizer.zero_grad()
                inputs = batch["input_ids"].to(device)
                if "attention_mask" in batch:
                    attention_mask = batch["attention_mask"].to(device)
                else:
                    attention_mask = None
                labels = batch["labels"].to(device)

                outputs = model(inputs, labels=labels, attention_mask=attention_mask)
                loss = outputs.loss
                loss.backward()
                # The optimizer must step before the scheduler; the reverse
                # order skips the first value of the schedule.
                optimizer.step()
                # Guard on the scheduler object itself, not on the config
                # entry, so a missing scheduler can never be stepped.
                if lr_scheduler is not None:
                    lr_scheduler.step()

                # One log call per optimisation step keeps all three metrics
                # on the same W&B step (each wandb.log call advances it).
                wandb.log({
                    "step": step + len(trainloader) * epoch,
                    "loss": loss.item(),
                    "learning_rate": optimizer.param_groups[0]['lr'],
                })

            val_loss = evaluate(model, valloader)
            wandb.log({"epoch": epoch, "val_loss": val_loss})

            #save model
            if epoch % save_epochs == 0:
                model_dir = os.path.join(save_path, f"epoch_{epoch}")
                os.makedirs(model_dir, exist_ok=True)
                model.save_pretrained(model_dir)
    finally:
        # Close the W&B run even when training is interrupted or fails, so it
        # is not left open for the next wandb.init to clean up.
        wandb.finish()

    #save model
    model_dir = os.path.join(save_path, "final_chkpoint")
    os.makedirs(model_dir, exist_ok=True)
    model.save_pretrained(model_dir)


In [91]:
# Experiment: language model on the natural_language / natural dataset.
classification = False
num_labels = 2

datatype = "natural_language"
sub_datatype = "natural"
batch_size = 128

embedding_dim = 768
n_layer = 12
n_head = 12
resid_pdrop = 0.1
embd_pdrop = 0.1
attn_pdrop = 0.1

model = load_model(embedding_dim, n_layer, n_head, resid_pdrop, embd_pdrop, attn_pdrop, classification, num_labels)
model.to(device)
trainloader, valloader = load_data(datatype, sub_datatype, classification, batch_size)
model_type = "class" if classification else "lm"

# Name of the learning-rate schedule ("cosine_annealing", "cosine" or "step"),
# or None for a constant learning rate. Kept separate from `lr_scheduler`,
# which holds the scheduler object handed to train().
lr_scheduler_name = None
model_name = f"{datatype}/{sub_datatype}/{model_type}"
train_config = {
    "num_epochs": 20,
    "lr": 0.00005,
    "lr_scheduler": lr_scheduler_name,
    "batch_size": batch_size,
    "resid_pdrop": resid_pdrop,
    "embd_pdrop": embd_pdrop,
    "n_head": n_head,
    "attn_pdrop": attn_pdrop,
    "model_name": model_name,
    "model_type": model_type,
    "embedding_dim": embedding_dim,
    "n_layer": n_layer,
    "datatype": datatype,
    "sub_datatype": sub_datatype,
    "num_labels": num_labels,
}

optimizer = torch.optim.Adam(model.parameters(), lr=train_config["lr"])
if lr_scheduler_name is None:
    lr_scheduler = None
elif lr_scheduler_name == "cosine_annealing":
    # NOTE(review): 30 is a magic number — presumably an assumed number of
    # steps per epoch; confirm.
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"]*30)
elif lr_scheduler_name == "cosine":
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"] * len(trainloader))
elif lr_scheduler_name == "step":
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.98)
else:
    # Fail before training starts instead of passing a bare string to train().
    raise ValueError(f"Unknown lr_scheduler: {lr_scheduler_name!r}")

train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader)

  0%|          | 0/100 [00:00<?, ?it/s]

In [9]:
# Experiment: language model on the natural_language / fourgram_B dataset.
classification = False
num_labels = 2

datatype = "natural_language"
sub_datatype = "fourgram_B"
batch_size = 128

embedding_dim = 768
n_layer = 12
n_head = 12
resid_pdrop = 0.1
embd_pdrop = 0.2
attn_pdrop = 0.2

model = load_model(embedding_dim, n_layer, n_head, resid_pdrop, embd_pdrop, attn_pdrop, classification, num_labels)
model.to(device)
trainloader, valloader = load_data(datatype, sub_datatype, classification, batch_size)
model_type = "class" if classification else "lm"

# Name of the learning-rate schedule ("cosine_annealing", "cosine" or "step"),
# or None for a constant learning rate. Kept separate from `lr_scheduler`,
# which holds the scheduler object handed to train().
lr_scheduler_name = None
model_name = f"{datatype}/{sub_datatype}/{model_type}"
train_config = {
    "num_epochs": 100,
    "lr": 0.00005,
    "lr_scheduler": lr_scheduler_name,
    "batch_size": batch_size,
    "resid_pdrop": resid_pdrop,
    "embd_pdrop": embd_pdrop,
    "n_head": n_head,
    "attn_pdrop": attn_pdrop,
    "model_name": model_name,
    "model_type": model_type,
    "embedding_dim": embedding_dim,
    "n_layer": n_layer,
    "datatype": datatype,
    "sub_datatype": sub_datatype,
    "num_labels": num_labels,
}

optimizer = torch.optim.Adam(model.parameters(), lr=train_config["lr"])
if lr_scheduler_name is None:
    lr_scheduler = None
elif lr_scheduler_name == "cosine_annealing":
    # NOTE(review): 30 is a magic number — presumably an assumed number of
    # steps per epoch; confirm.
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"]*30)
elif lr_scheduler_name == "cosine":
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"] * len(trainloader))
elif lr_scheduler_name == "step":
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.98)
else:
    # Fail before training starts instead of passing a bare string to train().
    raise ValueError(f"Unknown lr_scheduler: {lr_scheduler_name!r}")

train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader)

  0%|          | 0/100 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
100%|██████████| 157/157 [02:18<00:00,  1.13it/s]
100%|██████████| 40/40 [00:11<00:00,  3.55it/s]
100%|██████████| 157/157 [02:18<00:00,  1.13it/s]]
100%|██████████| 40/40 [00:11<00:00,  3.56it/s]
100%|██████████| 157/157 [02:18<00:00,  1.14it/s]]
100%|██████████| 40/40 [00:11<00:00,  3.57it/s]
100%|██████████| 157/157 [02:18<00:00,  1.14it/s]]
100%|██████████| 40/40 [00:11<00:00,  3.57it/s]
100%|██████████| 157/157 [02:18<00:00,  1.14it/s]]
100%|██████████| 40/40 [00:11<00:00,  3.57it/s]
100%|██████████| 157/157 [02:18<00:00,  1.14it/s]]
100%|██████████| 40/40 [00:11<00:00,  3.57it/s]
100%|██████████| 157/157 [02:18<00:00,  1.14it/s]]
100%|██████████| 40/40 [00:11<00:00,  3.56it/s]
100%|██████████| 157/157 [02:18<00:00,  1.13it/s]]
100%|██████████| 

KeyboardInterrupt: 

In [None]:
# Experiment: language model on the toy / fourgram_A dataset.
classification = False
num_labels = 2

datatype = "toy"
sub_datatype = "fourgram_A"
batch_size = 128

embedding_dim = 768
n_layer = 12
n_head = 12
resid_pdrop = 0.1
embd_pdrop = 0.2
attn_pdrop = 0.2

model = load_model(embedding_dim, n_layer, n_head, resid_pdrop, embd_pdrop, attn_pdrop, classification, num_labels)
model.to(device)
trainloader, valloader = load_data(datatype, sub_datatype, classification, batch_size)
model_type = "class" if classification else "lm"

# Name of the learning-rate schedule ("cosine_annealing", "cosine" or "step"),
# or None for a constant learning rate. Kept separate from `lr_scheduler`,
# which holds the scheduler object handed to train().
lr_scheduler_name = None
model_name = f"{datatype}/{sub_datatype}/{model_type}"
train_config = {
    "num_epochs": 50,
    "lr": 0.00001,
    "lr_scheduler": lr_scheduler_name,
    "batch_size": batch_size,
    "resid_pdrop": resid_pdrop,
    "embd_pdrop": embd_pdrop,
    "n_head": n_head,
    "attn_pdrop": attn_pdrop,
    "model_name": model_name,
    "model_type": model_type,
    "embedding_dim": embedding_dim,
    "n_layer": n_layer,
    "datatype": datatype,
    "sub_datatype": sub_datatype,
    "num_labels": num_labels,
}

optimizer = torch.optim.Adam(model.parameters(), lr=train_config["lr"])
if lr_scheduler_name is None:
    lr_scheduler = None
elif lr_scheduler_name == "cosine_annealing":
    # NOTE(review): 30 is a magic number — presumably an assumed number of
    # steps per epoch; confirm.
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"]*30)
elif lr_scheduler_name == "cosine":
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"] * len(trainloader))
elif lr_scheduler_name == "step":
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.98)
else:
    # Fail before training starts instead of passing a bare string to train().
    raise ValueError(f"Unknown lr_scheduler: {lr_scheduler_name!r}")

train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader)



0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,█▇▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▄▄▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▁▂▁▁▁▁
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,█▇▆▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,44.0
learning_rate,5e-05
loss,2.18329
step,7158.0
val_loss,4.70727


100%|██████████| 157/157 [02:17<00:00,  1.14it/s]
100%|██████████| 40/40 [00:11<00:00,  3.57it/s]
100%|██████████| 157/157 [02:18<00:00,  1.14it/s]
100%|██████████| 40/40 [00:11<00:00,  3.57it/s]
100%|██████████| 157/157 [02:18<00:00,  1.14it/s]
100%|██████████| 40/40 [00:11<00:00,  3.56it/s]
100%|██████████| 157/157 [02:18<00:00,  1.13it/s]
100%|██████████| 40/40 [00:11<00:00,  3.55it/s]
100%|██████████| 157/157 [02:18<00:00,  1.13it/s]
100%|██████████| 40/40 [00:11<00:00,  3.55it/s]
100%|██████████| 157/157 [02:18<00:00,  1.13it/s]
100%|██████████| 40/40 [00:11<00:00,  3.55it/s]
100%|██████████| 157/157 [02:18<00:00,  1.13it/s]
100%|██████████| 40/40 [00:11<00:00,  3.56it/s]
100%|██████████| 157/157 [02:18<00:00,  1.14it/s]
100%|██████████| 40/40 [00:11<00:00,  3.57it/s]
100%|██████████| 157/157 [02:18<00:00,  1.14it/s]
100%|██████████| 40/40 [00:11<00:00,  3.57it/s]
100%|██████████| 157/157 [02:18<00:00,  1.14it/s]
100%|██████████| 40/40 [00:11<00:00,  3.57it/s]
100%|██████████| 157

0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,█▆▅▅▅▄▄▄▄▄▃▃▃▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,█▆▅▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃

0,1
epoch,49.0
learning_rate,5e-05
loss,1.84485
step,7849.0
val_loss,4.10728




0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,█▃▂▂▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,█▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,39.0
learning_rate,5e-05
loss,0.53211
step,5353.0
val_loss,0.53326


  8%|▊         | 13/157 [00:11<02:07,  1.13it/s]
  0%|          | 0/50 [00:11<?, ?it/s]


KeyboardInterrupt: 

: 

: 

In [14]:
# Experiment: language model on the fragment / B dataset.
classification = False
num_labels = 2

datatype = "fragment"
sub_datatype = "B"
batch_size = 128

embedding_dim = 768
n_layer = 12
n_head = 12
resid_pdrop = 0.1
embd_pdrop = 0.2
attn_pdrop = 0.2

model = load_model(embedding_dim, n_layer, n_head, resid_pdrop, embd_pdrop, attn_pdrop, classification, num_labels)
model.to(device)
trainloader, valloader = load_data(datatype, sub_datatype, classification, batch_size)
model_type = "class" if classification else "lm"

# Name of the learning-rate schedule ("cosine_annealing", "cosine" or "step"),
# or None for a constant learning rate. Kept separate from `lr_scheduler`,
# which holds the scheduler object handed to train().
lr_scheduler_name = None
model_name = f"{datatype}/{sub_datatype}/{model_type}"
train_config = {
    "num_epochs": 100,
    "lr": 0.00005,
    "lr_scheduler": lr_scheduler_name,
    "batch_size": batch_size,
    "resid_pdrop": resid_pdrop,
    "embd_pdrop": embd_pdrop,
    "n_head": n_head,
    "attn_pdrop": attn_pdrop,
    "model_name": model_name,
    "model_type": model_type,
    "embedding_dim": embedding_dim,
    "n_layer": n_layer,
    "datatype": datatype,
    "sub_datatype": sub_datatype,
    "num_labels": num_labels,
}

optimizer = torch.optim.Adam(model.parameters(), lr=train_config["lr"])
if lr_scheduler_name is None:
    lr_scheduler = None
elif lr_scheduler_name == "cosine_annealing":
    # NOTE(review): 30 is a magic number — presumably an assumed number of
    # steps per epoch; confirm.
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"]*30)
elif lr_scheduler_name == "cosine":
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"] * len(trainloader))
elif lr_scheduler_name == "step":
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.98)
else:
    # Fail before training starts instead of passing a bare string to train().
    raise ValueError(f"Unknown lr_scheduler: {lr_scheduler_name!r}")

train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader)

100%|██████████| 132/132 [01:55<00:00,  1.15it/s]
100%|██████████| 33/33 [00:09<00:00,  3.59it/s]
100%|██████████| 132/132 [01:55<00:00,  1.14it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.59it/s]
100%|██████████| 132/132 [01:55<00:00,  1.14it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.59it/s]
100%|██████████| 132/132 [01:55<00:00,  1.14it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.59it/s]
100%|██████████| 132/132 [01:55<00:00,  1.14it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.59it/s]
100%|██████████| 132/132 [01:56<00:00,  1.14it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.58it/s]
100%|██████████| 132/132 [01:56<00:00,  1.13it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.57it/s]
100%|██████████| 132/132 [01:56<00:00,  1.13it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.57it/s]
100%|██████████| 132/132 [01:56<00:00,  1.13it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.57it/s]
100%|██████████| 132/132 [01:56<00:00,  1.13it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.57it/s]
100%|██████

KeyboardInterrupt: 

In [15]:
# Experiment: language model on the cycle / B dataset.
classification = False
num_labels = 2

datatype = "cycle"
sub_datatype = "B"
batch_size = 128

embedding_dim = 768
n_layer = 12
n_head = 12
resid_pdrop = 0.1
embd_pdrop = 0.2
attn_pdrop = 0.2

model = load_model(embedding_dim, n_layer, n_head, resid_pdrop, embd_pdrop, attn_pdrop, classification, num_labels)
model.to(device)
trainloader, valloader = load_data(datatype, sub_datatype, classification, batch_size)
model_type = "class" if classification else "lm"

# Name of the learning-rate schedule ("cosine_annealing", "cosine" or "step"),
# or None for a constant learning rate. Kept separate from `lr_scheduler`,
# which holds the scheduler object handed to train().
lr_scheduler_name = None
model_name = f"{datatype}/{sub_datatype}/{model_type}"
train_config = {
    "num_epochs": 100,
    "lr": 0.00005,
    "lr_scheduler": lr_scheduler_name,
    "batch_size": batch_size,
    "resid_pdrop": resid_pdrop,
    "embd_pdrop": embd_pdrop,
    "n_head": n_head,
    "attn_pdrop": attn_pdrop,
    "model_name": model_name,
    "model_type": model_type,
    "embedding_dim": embedding_dim,
    "n_layer": n_layer,
    "datatype": datatype,
    "sub_datatype": sub_datatype,
    "num_labels": num_labels,
}

optimizer = torch.optim.Adam(model.parameters(), lr=train_config["lr"])
if lr_scheduler_name is None:
    lr_scheduler = None
elif lr_scheduler_name == "cosine_annealing":
    # NOTE(review): 30 is a magic number — presumably an assumed number of
    # steps per epoch; confirm.
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"]*30)
elif lr_scheduler_name == "cosine":
    lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_config["num_epochs"] * len(trainloader))
elif lr_scheduler_name == "step":
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.98)
else:
    # Fail before training starts instead of passing a bare string to train().
    raise ValueError(f"Unknown lr_scheduler: {lr_scheduler_name!r}")

train(model, optimizer, lr_scheduler, train_config, model_name, trainloader, valloader)



0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
learning_rate,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
loss,█▄▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▁▁▁▁▁▁▁▁▁▁
step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_loss,█▃▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▂▂▂▂

0,1
epoch,52.0
learning_rate,5e-05
loss,0.7083
step,7092.0
val_loss,0.81086


100%|██████████| 132/132 [01:56<00:00,  1.14it/s]
100%|██████████| 33/33 [00:09<00:00,  3.57it/s]
100%|██████████| 132/132 [01:56<00:00,  1.13it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.57it/s]
100%|██████████| 132/132 [01:56<00:00,  1.13it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.58it/s]
100%|██████████| 132/132 [01:56<00:00,  1.14it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.59it/s]
100%|██████████| 132/132 [01:56<00:00,  1.14it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.59it/s]
100%|██████████| 132/132 [01:56<00:00,  1.14it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.59it/s]
100%|██████████| 132/132 [01:56<00:00,  1.14it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.59it/s]
100%|██████████| 132/132 [01:55<00:00,  1.14it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.59it/s]
100%|██████████| 132/132 [01:55<00:00,  1.14it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.59it/s]
100%|██████████| 132/132 [01:56<00:00,  1.14it/s]]
100%|██████████| 33/33 [00:09<00:00,  3.57it/s]
100%|██████