In [1]:
# this mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# enter the foldername in your Drive where you have saved the unzipped
# assignment folder, e.g. 'CS221/Project/'
FOLDERNAME = 'CS221/CS221Project/'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/{}'.format(FOLDERNAME))
sys.path.append('/content/drive/My Drive/NonIntNLP/Python')
sys.path.append('/content/drive/My Drive/NonIntNLP/Python/dataloaders')
sys.path.append('/content/drive/My Drive/NonIntNLP/Python/trainers')
sys.path.append('/content/drive/My Drive/apex')


Mounted at /content/drive


In [2]:
! pip install transformers
! pip install fluentcheck

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/3a/83/e74092e7f24a08d751aa59b37a9fc572b2e4af3918cb66f7766c3affb1b4/transformers-3.5.1-py3-none-any.whl (1.3MB)
[K     |▎                               | 10kB 26.3MB/s eta 0:00:01[K     |▌                               | 20kB 34.6MB/s eta 0:00:01[K     |▊                               | 30kB 25.6MB/s eta 0:00:01[K     |█                               | 40kB 20.3MB/s eta 0:00:01[K     |█▎                              | 51kB 14.1MB/s eta 0:00:01[K     |█▌                              | 61kB 15.2MB/s eta 0:00:01[K     |█▊                              | 71kB 14.1MB/s eta 0:00:01[K     |██                              | 81kB 14.1MB/s eta 0:00:01[K     |██▎                             | 92kB 14.5MB/s eta 0:00:01[K     |██▌                             | 102kB 14.6MB/s eta 0:00:01[K     |██▊                             | 112kB 14.6MB/s eta 0:00:01[K     |███                             | 

In [3]:
# %cd /content/drive/My Drive
# ! git clone https://github.com/neonbjb/NonIntNLP.git
# ! ls '/content/drive/My Drive/NonIntNLP/Python/dataloaders'
# ! git clone https://www.github.com/nvidia/apex
# ! ls '/content/drive/My Drive/apex'
# ! python '/content/drive/My Drive/apex/setup.py'

In [4]:
from chunked_language_model_trainer import ChunkedLMTrainer
from chunked_text_dataloader import ChunkedTextDataset
from transformers import XLNetConfig, XLNetLMHeadModel, XLNetTokenizer

import json
import math
import os
import pandas as pd
import random
import torch
import transformers

In [5]:
dataset = pd.read_csv('/content/drive/My Drive/bbc_data_original.csv')

In [6]:
tok = XLNetTokenizer.from_pretrained("xlnet-base-cased")
output = []
for i in range(dataset["Text"].size):
    text = dataset["Text"].get(i)
    text_enc = tok.encode(
        text, add_special_tokens=False, max_length=None, pad_to_max_length=True
    )
    summary = dataset["Summary"].get(i)
    summary_enc = tok.encode(
        summary, add_special_tokens=False, max_length=None, pad_to_max_length=True
    )
    output.append({
        "text": torch.tensor(text_enc, dtype=torch.long).to('cuda'),
        "target": torch.tensor(summary_enc, dtype=torch.long).to('cuda'),
    })

split_index = math.ceil(len(output) * 0.9)
train = output[:split_index]
val = output[split_index:]

torch.save(train, "train.pt")
torch.save(val, "val.pt")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=798011.0, style=ProgressStyle(descripti…






In [7]:
chunked_model_config = {
    "name": 'model_test',
    "max_seq_len": 256,
    "model_name": 'xlnet-base-cased',
    "predict_len": 32,
    "batch_size": 1,
    "starting_lr": 0.01,
    "output_dir": '',
    "mem_len": 64,
}

In [8]:
# chunked_model_config is a dictionary initialized from command line arguments. Most fields
# are self-explanatory or can be inferred from the ChunkedTextDataset docs.
# Get the datasets
input_folder = "" # current folder of this script

train_set = ChunkedTextDataset(
    os.path.join(input_folder, "train.pt"),
    tok,
    chunked_model_config['max_seq_len'],
    chunked_model_config['predict_len'],
    add_pads_to_target=True,
)
val_set = ChunkedTextDataset(
    os.path.join(input_folder, "val.pt"),
    tok,
    chunked_model_config['max_seq_len'],
    chunked_model_config['predict_len'],
    add_pads_to_target=True,
)
train_loader = train_set.get_dataloader(chunked_model_config['batch_size'], num_workers=0)
val_loader = val_set.get_dataloader(chunked_model_config['batch_size'], num_workers=0)

In [9]:
config = XLNetConfig.from_pretrained(chunked_model_config["model_name"])
config.mem_len = chunked_model_config["mem_len"]
model = XLNetLMHeadModel.from_pretrained(chunked_model_config["model_name"]).cuda()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=467042463.0, style=ProgressStyle(descri…




In [10]:
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters = [
    {
        "params": [
            p
            for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0,
    },
    {
        "params": [
            p
            for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        "weight_decay": 0.0,
    },
]
epochs = 1
aggregate_batch_size = 1
optimizer = transformers.AdamW(optimizer_grouped_parameters, lr=0.1, eps=1e-6)
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=epochs * len(train_set) / aggregate_batch_size,
)

In [11]:
trainer = ChunkedLMTrainer(
        model,
        chunked_model_config,
        train_loader,
        val_loader,
        optimizer,
        scheduler,
        device = 'cuda',
        is_fp16 = False,
        desired_batch_sz = aggregate_batch_size,
        do_wandb = False,
)

for _ in range(epochs):
    trainer.loop()



{'avg_chunks': 0.4, 'loss': 0.7438902854919434, 'learning_rate': 0.09995004995004995, 'optimizer_steps': 1}



Val Iteration:   0%|          | 0/222 [00:00<?, ?it/s][A
Val Iteration:   0%|          | 1/222 [00:00<00:28,  7.75it/s][A

Save completed. xlnet_trainer_checkpoints/chkpt_0



Val Iteration:   5%|▍         | 10/222 [00:00<00:19, 10.66it/s][A
Val Iteration:   6%|▌         | 13/222 [00:00<00:18, 11.32it/s][A
Val Iteration:   9%|▉         | 21/222 [00:00<00:13, 14.90it/s][A
Val Iteration:  14%|█▍        | 31/222 [00:00<00:10, 18.82it/s][A
Val Iteration:  18%|█▊        | 41/222 [00:01<00:07, 23.13it/s][A
Val Iteration:  23%|██▎       | 51/222 [00:01<00:05, 28.85it/s][A
Val Iteration:  27%|██▋       | 61/222 [00:01<00:04, 34.73it/s][A
Val Iteration:  32%|███▏      | 71/222 [00:01<00:04, 37.70it/s][A
Val Iteration:  36%|███▋      | 81/222 [00:01<00:03, 43.45it/s][A
Val Iteration:  41%|████      | 91/222 [00:01<00:03, 40.95it/s][A
Val Iteration:  45%|████▌     | 101/222 [00:02<00:02, 45.51it/s][A
Val Iteration:  50%|█████     | 111/222 [00:02<00:02, 50.04it/s][A
Val Iteration:  54%|█████▎    | 119/222 [00:02<00:01, 55.91it/s][A
Val Iteration:  57%|█████▋    | 126/222 [00:02<00:01, 49.72it/s][A
Val Iteration:  59%|█████▉    | 132/222 [00:02<00:02, 44.

Validation loss: 48.33228799571162


Train Iteration:   0%|          | 6/2002 [00:07<42:56,  1.29s/it]

{'avg_chunks': 2.4, 'loss': 103.04583511352538, 'learning_rate': 0.0997002997002997, 'optimizer_steps': 6}


Train Iteration:   1%|          | 11/2002 [00:08<13:37,  2.44it/s]

{'avg_chunks': 2.4, 'loss': 149.0715789794922, 'learning_rate': 0.09945054945054946, 'optimizer_steps': 11}


Train Iteration:   1%|          | 16/2002 [00:10<08:07,  4.07it/s]

{'avg_chunks': 2.4, 'loss': 215.9783142089844, 'learning_rate': 0.09920079920079922, 'optimizer_steps': 16}


Train Iteration:   1%|          | 21/2002 [00:11<07:39,  4.31it/s]

{'avg_chunks': 2.4, 'loss': 177.10672607421876, 'learning_rate': 0.09895104895104895, 'optimizer_steps': 21}


Train Iteration:   1%|▏         | 27/2002 [00:12<08:09,  4.04it/s]

{'avg_chunks': 3.2, 'loss': 144.3114807128906, 'learning_rate': 0.0987012987012987, 'optimizer_steps': 26}


Train Iteration:   2%|▏         | 31/2002 [00:14<08:13,  3.99it/s]

{'avg_chunks': 2.6, 'loss': 118.37667999267578, 'learning_rate': 0.09845154845154847, 'optimizer_steps': 31}


Train Iteration:   2%|▏         | 36/2002 [00:15<09:45,  3.36it/s]

{'avg_chunks': 2.8, 'loss': 55.73115234375, 'learning_rate': 0.09820179820179821, 'optimizer_steps': 36}


Train Iteration:   2%|▏         | 41/2002 [00:16<08:31,  3.83it/s]

{'avg_chunks': 2.6, 'loss': 53.54200820922851, 'learning_rate': 0.09795204795204795, 'optimizer_steps': 41}


Train Iteration:   2%|▏         | 46/2002 [00:17<08:32,  3.81it/s]

{'avg_chunks': 2.4, 'loss': 46.70757904052734, 'learning_rate': 0.09770229770229771, 'optimizer_steps': 46}


Train Iteration:   3%|▎         | 51/2002 [00:18<07:03,  4.61it/s]

{'avg_chunks': 2.0, 'loss': 36.16923141479492, 'learning_rate': 0.09745254745254746, 'optimizer_steps': 51}


Train Iteration:   3%|▎         | 57/2002 [00:20<07:16,  4.46it/s]

{'avg_chunks': 2.4, 'loss': 25.05093994140625, 'learning_rate': 0.0972027972027972, 'optimizer_steps': 56}


Train Iteration:   3%|▎         | 61/2002 [00:20<06:12,  5.21it/s]

{'avg_chunks': 1.6, 'loss': 25.31758842468262, 'learning_rate': 0.09695304695304696, 'optimizer_steps': 61}


Train Iteration:   3%|▎         | 66/2002 [00:22<07:12,  4.48it/s]

{'avg_chunks': 2.2, 'loss': 19.213733673095703, 'learning_rate': 0.0967032967032967, 'optimizer_steps': 66}


Train Iteration:   4%|▎         | 71/2002 [00:23<08:01,  4.01it/s]

{'avg_chunks': 2.8, 'loss': 20.511669921875, 'learning_rate': 0.09645354645354647, 'optimizer_steps': 71}


Train Iteration:   4%|▍         | 76/2002 [00:24<08:07,  3.95it/s]

{'avg_chunks': 2.4, 'loss': 22.820590591430665, 'learning_rate': 0.09620379620379621, 'optimizer_steps': 76}


Train Iteration:   4%|▍         | 81/2002 [00:26<09:27,  3.39it/s]

{'avg_chunks': 3.4, 'loss': 20.448147201538085, 'learning_rate': 0.09595404595404596, 'optimizer_steps': 81}


Train Iteration:   4%|▍         | 86/2002 [00:27<08:28,  3.77it/s]

{'avg_chunks': 2.6, 'loss': 20.763956832885743, 'learning_rate': 0.09570429570429571, 'optimizer_steps': 86}


Train Iteration:   5%|▍         | 91/2002 [00:28<07:26,  4.28it/s]

{'avg_chunks': 2.4, 'loss': 17.77301445007324, 'learning_rate': 0.09545454545454546, 'optimizer_steps': 91}


Train Iteration:   5%|▍         | 96/2002 [00:29<07:52,  4.03it/s]

{'avg_chunks': 2.4, 'loss': 18.1457576751709, 'learning_rate': 0.0952047952047952, 'optimizer_steps': 96}


Train Iteration:   5%|▌         | 101/2002 [00:31<07:38,  4.15it/s]

{'avg_chunks': 2.8, 'loss': 16.75584259033203, 'learning_rate': 0.09495504495504496, 'optimizer_steps': 101}


Train Iteration:   5%|▌         | 106/2002 [00:32<07:24,  4.27it/s]

{'avg_chunks': 2.2, 'loss': 16.66281394958496, 'learning_rate': 0.09470529470529471, 'optimizer_steps': 106}


Train Iteration:   6%|▌         | 112/2002 [00:33<06:56,  4.54it/s]

{'avg_chunks': 2.4, 'loss': 16.462391662597657, 'learning_rate': 0.09445554445554447, 'optimizer_steps': 111}


Train Iteration:   6%|▌         | 116/2002 [00:34<07:36,  4.13it/s]

{'avg_chunks': 2.2, 'loss': 12.727856254577636, 'learning_rate': 0.09420579420579421, 'optimizer_steps': 116}


Train Iteration:   6%|▌         | 121/2002 [00:35<07:08,  4.39it/s]

{'avg_chunks': 2.4, 'loss': 16.161770820617676, 'learning_rate': 0.09395604395604396, 'optimizer_steps': 121}


Train Iteration:   6%|▋         | 126/2002 [00:37<09:03,  3.45it/s]

{'avg_chunks': 3.0, 'loss': 16.003500938415527, 'learning_rate': 0.09370629370629371, 'optimizer_steps': 126}


Train Iteration:   7%|▋         | 131/2002 [00:38<06:50,  4.56it/s]

{'avg_chunks': 2.0, 'loss': 15.687211608886718, 'learning_rate': 0.09345654345654347, 'optimizer_steps': 131}


Train Iteration:   7%|▋         | 136/2002 [00:39<06:27,  4.82it/s]

{'avg_chunks': 2.0, 'loss': 14.002287101745605, 'learning_rate': 0.0932067932067932, 'optimizer_steps': 136}


Train Iteration:   7%|▋         | 141/2002 [00:40<07:49,  3.96it/s]

{'avg_chunks': 2.4, 'loss': 15.668413734436035, 'learning_rate': 0.09295704295704296, 'optimizer_steps': 141}


Train Iteration:   7%|▋         | 146/2002 [00:41<06:44,  4.59it/s]

{'avg_chunks': 2.0, 'loss': 16.51475238800049, 'learning_rate': 0.09270729270729272, 'optimizer_steps': 146}


Train Iteration:   8%|▊         | 151/2002 [00:42<06:48,  4.53it/s]

{'avg_chunks': 2.0, 'loss': 17.48291721343994, 'learning_rate': 0.09245754245754245, 'optimizer_steps': 151}


Train Iteration:   8%|▊         | 156/2002 [00:43<07:05,  4.34it/s]

{'avg_chunks': 2.2, 'loss': 16.19205799102783, 'learning_rate': 0.09220779220779221, 'optimizer_steps': 156}


Train Iteration:   8%|▊         | 161/2002 [00:44<07:13,  4.25it/s]

{'avg_chunks': 2.2, 'loss': 13.468327522277832, 'learning_rate': 0.09195804195804197, 'optimizer_steps': 161}


Train Iteration:   8%|▊         | 166/2002 [00:45<06:25,  4.76it/s]

{'avg_chunks': 2.0, 'loss': 14.923709297180176, 'learning_rate': 0.09170829170829171, 'optimizer_steps': 166}


Train Iteration:   9%|▊         | 171/2002 [00:46<05:56,  5.13it/s]

{'avg_chunks': 1.8, 'loss': 14.467556381225586, 'learning_rate': 0.09145854145854146, 'optimizer_steps': 171}


Train Iteration:   9%|▉         | 176/2002 [00:48<07:09,  4.25it/s]

{'avg_chunks': 2.6, 'loss': 14.528766059875489, 'learning_rate': 0.09120879120879122, 'optimizer_steps': 176}


Train Iteration:   9%|▉         | 181/2002 [00:49<07:46,  3.91it/s]

{'avg_chunks': 2.8, 'loss': 15.000835418701172, 'learning_rate': 0.09095904095904096, 'optimizer_steps': 181}


Train Iteration:   9%|▉         | 186/2002 [00:50<07:27,  4.05it/s]

{'avg_chunks': 2.6, 'loss': 13.10334587097168, 'learning_rate': 0.09070929070929072, 'optimizer_steps': 186}


Train Iteration:  10%|▉         | 191/2002 [00:51<07:19,  4.12it/s]

{'avg_chunks': 2.4, 'loss': 13.101942253112792, 'learning_rate': 0.09045954045954047, 'optimizer_steps': 191}


Train Iteration:  10%|▉         | 196/2002 [00:53<09:12,  3.27it/s]

{'avg_chunks': 3.4, 'loss': 12.50081844329834, 'learning_rate': 0.09020979020979021, 'optimizer_steps': 196}


Train Iteration:  10%|█         | 201/2002 [00:54<07:34,  3.97it/s]

{'avg_chunks': 2.6, 'loss': 12.555005264282226, 'learning_rate': 0.08996003996003997, 'optimizer_steps': 201}


Train Iteration:  10%|█         | 207/2002 [00:56<06:01,  4.97it/s]

{'avg_chunks': 2.2, 'loss': 11.578371620178222, 'learning_rate': 0.08971028971028971, 'optimizer_steps': 206}


Train Iteration:  11%|█         | 211/2002 [00:57<07:00,  4.25it/s]

{'avg_chunks': 2.2, 'loss': 12.472086334228516, 'learning_rate': 0.08946053946053946, 'optimizer_steps': 211}


Train Iteration:  11%|█         | 216/2002 [00:58<07:13,  4.12it/s]

{'avg_chunks': 2.4, 'loss': 11.450739479064941, 'learning_rate': 0.08921078921078922, 'optimizer_steps': 216}


Train Iteration:  11%|█         | 221/2002 [00:59<06:49,  4.35it/s]

{'avg_chunks': 2.4, 'loss': 11.593353080749512, 'learning_rate': 0.08896103896103896, 'optimizer_steps': 221}


Train Iteration:  11%|█▏        | 226/2002 [01:02<24:27,  1.21it/s]

{'avg_chunks': 5.0, 'loss': 10.556093215942383, 'learning_rate': 0.08871128871128872, 'optimizer_steps': 226}


Train Iteration:  12%|█▏        | 231/2002 [01:04<11:29,  2.57it/s]

{'avg_chunks': 3.0, 'loss': 12.746941566467285, 'learning_rate': 0.08846153846153847, 'optimizer_steps': 231}


Train Iteration:  12%|█▏        | 236/2002 [01:05<08:17,  3.55it/s]

{'avg_chunks': 2.6, 'loss': 12.37319507598877, 'learning_rate': 0.08821178821178821, 'optimizer_steps': 236}


Train Iteration:  12%|█▏        | 241/2002 [01:06<07:44,  3.79it/s]

{'avg_chunks': 2.8, 'loss': 11.462517738342285, 'learning_rate': 0.08796203796203797, 'optimizer_steps': 241}


Train Iteration:  12%|█▏        | 246/2002 [01:08<07:06,  4.12it/s]

{'avg_chunks': 2.2, 'loss': 11.306901741027833, 'learning_rate': 0.08771228771228773, 'optimizer_steps': 246}


Train Iteration:  13%|█▎        | 251/2002 [01:09<07:53,  3.70it/s]

{'avg_chunks': 2.8, 'loss': 10.132409858703614, 'learning_rate': 0.08746253746253746, 'optimizer_steps': 251}


Train Iteration:  13%|█▎        | 256/2002 [01:10<07:24,  3.93it/s]

{'avg_chunks': 2.4, 'loss': 10.689513778686523, 'learning_rate': 0.08721278721278722, 'optimizer_steps': 256}


Train Iteration:  13%|█▎        | 261/2002 [01:11<06:15,  4.63it/s]

{'avg_chunks': 2.0, 'loss': 10.772686767578126, 'learning_rate': 0.08696303696303698, 'optimizer_steps': 261}


Train Iteration:  13%|█▎        | 266/2002 [01:15<16:00,  1.81it/s]

{'avg_chunks': 5.6, 'loss': 11.228912544250488, 'learning_rate': 0.08671328671328671, 'optimizer_steps': 266}


Train Iteration:  14%|█▎        | 271/2002 [01:17<10:54,  2.65it/s]

{'avg_chunks': 3.4, 'loss': 10.403812599182128, 'learning_rate': 0.08646353646353647, 'optimizer_steps': 271}


Train Iteration:  14%|█▍        | 276/2002 [01:19<09:29,  3.03it/s]

{'avg_chunks': 3.2, 'loss': 10.87584228515625, 'learning_rate': 0.08621378621378623, 'optimizer_steps': 276}


Train Iteration:  14%|█▍        | 281/2002 [01:20<06:24,  4.48it/s]

{'avg_chunks': 1.8, 'loss': 10.525705337524414, 'learning_rate': 0.08596403596403597, 'optimizer_steps': 281}


Train Iteration:  14%|█▍        | 286/2002 [01:21<07:29,  3.82it/s]

{'avg_chunks': 2.6, 'loss': 10.7236328125, 'learning_rate': 0.08571428571428572, 'optimizer_steps': 286}


Train Iteration:  15%|█▍        | 291/2002 [01:22<06:54,  4.13it/s]

{'avg_chunks': 2.2, 'loss': 10.171555328369141, 'learning_rate': 0.08546453546453547, 'optimizer_steps': 291}


Train Iteration:  15%|█▍        | 296/2002 [01:23<06:12,  4.58it/s]

{'avg_chunks': 2.0, 'loss': 10.805049514770507, 'learning_rate': 0.08521478521478522, 'optimizer_steps': 296}


Train Iteration:  15%|█▌        | 301/2002 [01:24<07:57,  3.57it/s]

{'avg_chunks': 2.6, 'loss': 10.190899276733399, 'learning_rate': 0.08496503496503498, 'optimizer_steps': 301}


Train Iteration:  15%|█▌        | 306/2002 [01:26<07:03,  4.00it/s]

{'avg_chunks': 2.6, 'loss': 11.891876792907714, 'learning_rate': 0.08471528471528472, 'optimizer_steps': 306}


Train Iteration:  16%|█▌        | 311/2002 [01:27<05:55,  4.75it/s]

{'avg_chunks': 2.0, 'loss': 10.156708908081054, 'learning_rate': 0.08446553446553447, 'optimizer_steps': 311}


Train Iteration:  16%|█▌        | 316/2002 [01:28<07:32,  3.73it/s]

{'avg_chunks': 2.8, 'loss': 10.519412803649903, 'learning_rate': 0.08421578421578423, 'optimizer_steps': 316}


Train Iteration:  16%|█▌        | 321/2002 [01:29<06:41,  4.18it/s]

{'avg_chunks': 2.4, 'loss': 11.353215980529786, 'learning_rate': 0.08396603396603397, 'optimizer_steps': 321}


Train Iteration:  16%|█▋        | 326/2002 [01:33<12:02,  2.32it/s]

{'avg_chunks': 5.2, 'loss': 11.344393157958985, 'learning_rate': 0.08371628371628372, 'optimizer_steps': 326}


Train Iteration:  17%|█▋        | 331/2002 [01:34<07:26,  3.74it/s]

{'avg_chunks': 2.4, 'loss': 11.987795639038087, 'learning_rate': 0.08346653346653347, 'optimizer_steps': 331}


Train Iteration:  17%|█▋        | 336/2002 [01:35<07:18,  3.80it/s]

{'avg_chunks': 2.8, 'loss': 11.230937576293945, 'learning_rate': 0.08321678321678322, 'optimizer_steps': 336}


Train Iteration:  17%|█▋        | 341/2002 [01:37<06:54,  4.01it/s]

{'avg_chunks': 2.2, 'loss': 11.114351081848145, 'learning_rate': 0.08296703296703298, 'optimizer_steps': 341}


Train Iteration:  17%|█▋        | 346/2002 [01:38<08:11,  3.37it/s]

{'avg_chunks': 3.0, 'loss': 10.248744773864747, 'learning_rate': 0.08271728271728272, 'optimizer_steps': 346}


Train Iteration:  18%|█▊        | 351/2002 [01:40<09:08,  3.01it/s]

{'avg_chunks': 3.2, 'loss': 10.163054847717286, 'learning_rate': 0.08246753246753247, 'optimizer_steps': 351}


Train Iteration:  18%|█▊        | 356/2002 [01:41<07:08,  3.84it/s]

{'avg_chunks': 2.4, 'loss': 10.491398811340332, 'learning_rate': 0.08221778221778223, 'optimizer_steps': 356}


Train Iteration:  18%|█▊        | 361/2002 [01:42<06:19,  4.32it/s]

{'avg_chunks': 2.2, 'loss': 10.788904762268066, 'learning_rate': 0.08196803196803198, 'optimizer_steps': 361}


Train Iteration:  18%|█▊        | 366/2002 [01:43<07:00,  3.89it/s]

{'avg_chunks': 2.6, 'loss': 10.744265365600587, 'learning_rate': 0.08171828171828172, 'optimizer_steps': 366}


Train Iteration:  19%|█▊        | 372/2002 [01:45<05:33,  4.89it/s]

{'avg_chunks': 2.2, 'loss': 10.534098815917968, 'learning_rate': 0.08146853146853147, 'optimizer_steps': 371}


Train Iteration:  19%|█▉        | 376/2002 [01:46<06:23,  4.24it/s]

{'avg_chunks': 2.0, 'loss': 10.043969535827637, 'learning_rate': 0.08121878121878123, 'optimizer_steps': 376}


Train Iteration:  19%|█▉        | 382/2002 [01:47<06:02,  4.47it/s]

{'avg_chunks': 2.6, 'loss': 10.308920860290527, 'learning_rate': 0.08096903096903096, 'optimizer_steps': 381}


Train Iteration:  19%|█▉        | 386/2002 [01:48<05:52,  4.59it/s]

{'avg_chunks': 1.8, 'loss': 9.080949211120606, 'learning_rate': 0.08071928071928072, 'optimizer_steps': 386}


Train Iteration:  20%|█▉        | 391/2002 [01:49<06:21,  4.22it/s]

{'avg_chunks': 2.2, 'loss': 11.136988258361816, 'learning_rate': 0.08046953046953048, 'optimizer_steps': 391}


Train Iteration:  20%|█▉        | 397/2002 [01:50<05:23,  4.96it/s]

{'avg_chunks': 2.2, 'loss': 11.366916084289551, 'learning_rate': 0.08021978021978023, 'optimizer_steps': 396}


Train Iteration:  20%|██        | 401/2002 [01:51<06:20,  4.20it/s]

{'avg_chunks': 2.2, 'loss': 11.213338279724121, 'learning_rate': 0.07997002997002997, 'optimizer_steps': 401}


Train Iteration:  20%|██        | 406/2002 [01:53<05:53,  4.52it/s]

{'avg_chunks': 2.2, 'loss': 11.026898956298828, 'learning_rate': 0.07972027972027973, 'optimizer_steps': 406}


Train Iteration:  21%|██        | 411/2002 [01:54<06:30,  4.07it/s]

{'avg_chunks': 2.2, 'loss': 11.64288787841797, 'learning_rate': 0.07947052947052947, 'optimizer_steps': 411}


Train Iteration:  21%|██        | 416/2002 [01:55<05:44,  4.60it/s]

{'avg_chunks': 2.0, 'loss': 11.28707332611084, 'learning_rate': 0.07922077922077923, 'optimizer_steps': 416}


Train Iteration:  21%|██        | 421/2002 [01:56<06:23,  4.12it/s]

{'avg_chunks': 2.4, 'loss': 10.692462158203124, 'learning_rate': 0.07897102897102898, 'optimizer_steps': 421}


Train Iteration:  21%|██▏       | 427/2002 [01:58<07:30,  3.50it/s]

{'avg_chunks': 3.6, 'loss': 10.110586738586425, 'learning_rate': 0.07872127872127872, 'optimizer_steps': 426}


Train Iteration:  22%|██▏       | 431/2002 [02:00<10:49,  2.42it/s]

{'avg_chunks': 3.0, 'loss': 10.679300117492676, 'learning_rate': 0.07847152847152848, 'optimizer_steps': 431}


Train Iteration:  22%|██▏       | 436/2002 [02:01<08:08,  3.21it/s]

{'avg_chunks': 2.4, 'loss': 10.413250350952149, 'learning_rate': 0.07822177822177823, 'optimizer_steps': 436}


Train Iteration:  22%|██▏       | 441/2002 [02:02<06:37,  3.93it/s]

{'avg_chunks': 2.6, 'loss': 11.133613014221192, 'learning_rate': 0.07797202797202797, 'optimizer_steps': 441}


Train Iteration:  22%|██▏       | 446/2002 [02:04<08:40,  2.99it/s]

{'avg_chunks': 3.0, 'loss': 12.241972541809082, 'learning_rate': 0.07772227772227773, 'optimizer_steps': 446}


Train Iteration:  23%|██▎       | 451/2002 [02:05<06:48,  3.80it/s]

{'avg_chunks': 2.4, 'loss': 10.759056854248048, 'learning_rate': 0.07747252747252747, 'optimizer_steps': 451}


Train Iteration:  23%|██▎       | 456/2002 [02:06<06:21,  4.05it/s]

{'avg_chunks': 2.2, 'loss': 9.983380317687988, 'learning_rate': 0.07722277722277723, 'optimizer_steps': 456}


Train Iteration:  23%|██▎       | 461/2002 [02:08<07:27,  3.44it/s]

{'avg_chunks': 2.8, 'loss': 9.90760154724121, 'learning_rate': 0.07697302697302698, 'optimizer_steps': 461}


Train Iteration:  23%|██▎       | 467/2002 [02:09<05:57,  4.29it/s]

{'avg_chunks': 2.6, 'loss': 10.43996753692627, 'learning_rate': 0.07672327672327672, 'optimizer_steps': 466}


Train Iteration:  24%|██▎       | 471/2002 [02:10<07:00,  3.64it/s]

{'avg_chunks': 2.2, 'loss': 10.009335327148438, 'learning_rate': 0.07647352647352648, 'optimizer_steps': 471}


Train Iteration:  24%|██▍       | 476/2002 [02:11<07:06,  3.57it/s]

{'avg_chunks': 2.4, 'loss': 11.31336269378662, 'learning_rate': 0.07622377622377623, 'optimizer_steps': 476}


Train Iteration:  24%|██▍       | 481/2002 [02:13<05:51,  4.33it/s]

{'avg_chunks': 2.0, 'loss': 9.947433853149414, 'learning_rate': 0.07597402597402597, 'optimizer_steps': 481}


Train Iteration:  24%|██▍       | 486/2002 [02:14<06:17,  4.02it/s]

{'avg_chunks': 2.6, 'loss': 10.478490447998047, 'learning_rate': 0.07572427572427573, 'optimizer_steps': 486}


Train Iteration:  25%|██▍       | 491/2002 [02:15<07:22,  3.42it/s]

{'avg_chunks': 2.8, 'loss': 9.582706642150878, 'learning_rate': 0.07547452547452549, 'optimizer_steps': 491}


Train Iteration:  25%|██▍       | 496/2002 [02:17<07:14,  3.47it/s]

{'avg_chunks': 2.6, 'loss': 10.231042861938477, 'learning_rate': 0.07522477522477522, 'optimizer_steps': 496}


Train Iteration:  25%|██▌       | 501/2002 [02:18<06:42,  3.73it/s]

{'avg_chunks': 2.4, 'loss': 9.85688419342041, 'learning_rate': 0.07497502497502498, 'optimizer_steps': 501}


Train Iteration:  25%|██▌       | 507/2002 [02:19<04:49,  5.17it/s]

{'avg_chunks': 1.8, 'loss': 10.447723770141602, 'learning_rate': 0.07472527472527472, 'optimizer_steps': 506}


Train Iteration:  26%|██▌       | 511/2002 [02:20<06:49,  3.64it/s]

{'avg_chunks': 2.4, 'loss': 11.022438430786133, 'learning_rate': 0.07447552447552448, 'optimizer_steps': 511}


Train Iteration:  26%|██▌       | 516/2002 [02:21<06:06,  4.05it/s]

{'avg_chunks': 2.2, 'loss': 10.108757781982423, 'learning_rate': 0.07422577422577423, 'optimizer_steps': 516}


Train Iteration:  26%|██▌       | 521/2002 [02:23<06:24,  3.85it/s]

{'avg_chunks': 2.6, 'loss': 10.230106544494628, 'learning_rate': 0.07397602397602397, 'optimizer_steps': 521}


Train Iteration:  26%|██▋       | 526/2002 [02:24<05:41,  4.32it/s]

{'avg_chunks': 2.2, 'loss': 10.151172828674316, 'learning_rate': 0.07372627372627373, 'optimizer_steps': 526}


Train Iteration:  27%|██▋       | 531/2002 [02:25<06:08,  3.99it/s]

{'avg_chunks': 2.4, 'loss': 9.393760490417481, 'learning_rate': 0.07347652347652349, 'optimizer_steps': 531}


Train Iteration:  27%|██▋       | 536/2002 [02:26<05:27,  4.48it/s]

{'avg_chunks': 2.0, 'loss': 11.116726875305176, 'learning_rate': 0.07322677322677322, 'optimizer_steps': 536}


Train Iteration:  27%|██▋       | 541/2002 [02:27<05:47,  4.20it/s]

{'avg_chunks': 2.2, 'loss': 10.115573310852051, 'learning_rate': 0.07297702297702298, 'optimizer_steps': 541}


Train Iteration:  27%|██▋       | 546/2002 [02:29<06:22,  3.81it/s]

{'avg_chunks': 3.0, 'loss': 10.696570777893067, 'learning_rate': 0.07272727272727274, 'optimizer_steps': 546}


Train Iteration:  28%|██▊       | 551/2002 [02:30<07:28,  3.24it/s]

{'avg_chunks': 2.6, 'loss': 11.365528297424316, 'learning_rate': 0.07247752247752248, 'optimizer_steps': 551}


Train Iteration:  28%|██▊       | 556/2002 [02:31<06:01,  4.00it/s]

{'avg_chunks': 2.2, 'loss': 10.416645050048828, 'learning_rate': 0.07222777222777223, 'optimizer_steps': 556}


Train Iteration:  28%|██▊       | 561/2002 [02:32<06:04,  3.96it/s]

{'avg_chunks': 2.4, 'loss': 10.027690124511718, 'learning_rate': 0.07197802197802199, 'optimizer_steps': 561}


Train Iteration:  28%|██▊       | 566/2002 [02:34<05:42,  4.19it/s]

{'avg_chunks': 2.4, 'loss': 11.312941741943359, 'learning_rate': 0.07172827172827173, 'optimizer_steps': 566}


Train Iteration:  29%|██▊       | 571/2002 [02:35<04:59,  4.78it/s]

{'avg_chunks': 2.0, 'loss': 11.215078735351563, 'learning_rate': 0.07147852147852149, 'optimizer_steps': 571}


Train Iteration:  29%|██▉       | 576/2002 [02:36<05:50,  4.07it/s]

{'avg_chunks': 2.4, 'loss': 9.941338157653808, 'learning_rate': 0.07122877122877123, 'optimizer_steps': 576}


Train Iteration:  29%|██▉       | 581/2002 [02:37<05:55,  3.99it/s]

{'avg_chunks': 2.4, 'loss': 11.437926292419434, 'learning_rate': 0.07097902097902098, 'optimizer_steps': 581}


Train Iteration:  29%|██▉       | 586/2002 [02:38<05:08,  4.59it/s]

{'avg_chunks': 2.0, 'loss': 10.473114967346191, 'learning_rate': 0.07072927072927074, 'optimizer_steps': 586}


Train Iteration:  30%|██▉       | 591/2002 [02:39<05:30,  4.27it/s]

{'avg_chunks': 2.4, 'loss': 10.845776748657226, 'learning_rate': 0.07047952047952048, 'optimizer_steps': 591}


Train Iteration:  30%|██▉       | 596/2002 [02:41<07:25,  3.16it/s]

{'avg_chunks': 3.0, 'loss': 10.284721374511719, 'learning_rate': 0.07022977022977023, 'optimizer_steps': 596}


Train Iteration:  30%|███       | 601/2002 [02:43<08:05,  2.89it/s]

{'avg_chunks': 3.4, 'loss': 10.603668594360352, 'learning_rate': 0.06998001998001999, 'optimizer_steps': 601}


Train Iteration:  30%|███       | 606/2002 [02:44<05:37,  4.14it/s]

{'avg_chunks': 2.2, 'loss': 10.64733009338379, 'learning_rate': 0.06973026973026973, 'optimizer_steps': 606}


Train Iteration:  31%|███       | 611/2002 [02:45<05:00,  4.63it/s]

{'avg_chunks': 2.0, 'loss': 10.42613410949707, 'learning_rate': 0.06948051948051948, 'optimizer_steps': 611}


Train Iteration:  31%|███       | 616/2002 [02:46<04:35,  5.04it/s]

{'avg_chunks': 1.8, 'loss': 9.89682388305664, 'learning_rate': 0.06923076923076923, 'optimizer_steps': 616}


Train Iteration:  31%|███       | 621/2002 [02:47<05:22,  4.28it/s]

{'avg_chunks': 2.2, 'loss': 10.53381061553955, 'learning_rate': 0.06898101898101898, 'optimizer_steps': 621}


Train Iteration:  31%|███▏      | 626/2002 [02:49<06:55,  3.31it/s]

{'avg_chunks': 3.0, 'loss': 10.405770111083985, 'learning_rate': 0.06873126873126874, 'optimizer_steps': 626}


Train Iteration:  32%|███▏      | 631/2002 [02:50<05:47,  3.95it/s]

{'avg_chunks': 2.2, 'loss': 9.340214729309082, 'learning_rate': 0.06848151848151848, 'optimizer_steps': 631}


Train Iteration:  32%|███▏      | 636/2002 [02:51<06:17,  3.61it/s]

{'avg_chunks': 2.8, 'loss': 10.023968696594238, 'learning_rate': 0.06823176823176823, 'optimizer_steps': 636}


Train Iteration:  32%|███▏      | 641/2002 [02:53<06:13,  3.64it/s]

{'avg_chunks': 2.8, 'loss': 9.516851806640625, 'learning_rate': 0.06798201798201799, 'optimizer_steps': 641}


Train Iteration:  32%|███▏      | 646/2002 [02:54<06:01,  3.75it/s]

{'avg_chunks': 2.6, 'loss': 10.044678115844727, 'learning_rate': 0.06773226773226775, 'optimizer_steps': 646}


Train Iteration:  33%|███▎      | 651/2002 [02:55<05:36,  4.02it/s]

{'avg_chunks': 2.4, 'loss': 9.80593662261963, 'learning_rate': 0.06748251748251748, 'optimizer_steps': 651}


Train Iteration:  33%|███▎      | 656/2002 [02:57<06:46,  3.31it/s]

{'avg_chunks': 3.2, 'loss': 9.730797004699706, 'learning_rate': 0.06723276723276723, 'optimizer_steps': 656}


Train Iteration:  33%|███▎      | 661/2002 [02:58<05:28,  4.08it/s]

{'avg_chunks': 2.4, 'loss': 9.576017379760742, 'learning_rate': 0.066983016983017, 'optimizer_steps': 661}


Train Iteration:  33%|███▎      | 666/2002 [02:59<06:10,  3.61it/s]

{'avg_chunks': 2.6, 'loss': 9.270791053771973, 'learning_rate': 0.06673326673326674, 'optimizer_steps': 666}


Train Iteration:  34%|███▎      | 671/2002 [03:01<05:52,  3.77it/s]

{'avg_chunks': 2.6, 'loss': 10.368344688415528, 'learning_rate': 0.06648351648351648, 'optimizer_steps': 671}


Train Iteration:  34%|███▍      | 677/2002 [03:02<04:22,  5.06it/s]

{'avg_chunks': 1.8, 'loss': 9.906032371520997, 'learning_rate': 0.06623376623376624, 'optimizer_steps': 676}


Train Iteration:  34%|███▍      | 681/2002 [03:03<05:16,  4.18it/s]

{'avg_chunks': 2.2, 'loss': 10.231845664978028, 'learning_rate': 0.06598401598401599, 'optimizer_steps': 681}


Train Iteration:  34%|███▍      | 686/2002 [03:04<06:48,  3.22it/s]

{'avg_chunks': 3.2, 'loss': 10.12775936126709, 'learning_rate': 0.06573426573426573, 'optimizer_steps': 686}


Train Iteration:  35%|███▍      | 691/2002 [03:06<05:30,  3.96it/s]

{'avg_chunks': 2.2, 'loss': 9.638513565063477, 'learning_rate': 0.06548451548451549, 'optimizer_steps': 691}


Train Iteration:  35%|███▍      | 696/2002 [03:06<04:05,  5.33it/s]

{'avg_chunks': 1.6, 'loss': 10.991371345520019, 'learning_rate': 0.06523476523476524, 'optimizer_steps': 696}


Train Iteration:  35%|███▌      | 701/2002 [03:08<05:25,  4.00it/s]

{'avg_chunks': 2.6, 'loss': 9.689460372924804, 'learning_rate': 0.064985014985015, 'optimizer_steps': 701}


Train Iteration:  35%|███▌      | 706/2002 [03:09<05:07,  4.21it/s]

{'avg_chunks': 2.4, 'loss': 10.122881507873535, 'learning_rate': 0.06473526473526474, 'optimizer_steps': 706}


Train Iteration:  36%|███▌      | 711/2002 [03:10<05:50,  3.68it/s]

{'avg_chunks': 2.6, 'loss': 9.748155975341797, 'learning_rate': 0.06448551448551448, 'optimizer_steps': 711}


Train Iteration:  36%|███▌      | 717/2002 [03:12<04:32,  4.72it/s]

{'avg_chunks': 2.0, 'loss': 10.333506011962891, 'learning_rate': 0.06423576423576424, 'optimizer_steps': 716}


Train Iteration:  36%|███▌      | 721/2002 [03:13<05:31,  3.86it/s]

{'avg_chunks': 2.4, 'loss': 9.4877610206604, 'learning_rate': 0.06398601398601399, 'optimizer_steps': 721}


Train Iteration:  36%|███▋      | 727/2002 [03:14<04:25,  4.80it/s]

{'avg_chunks': 2.2, 'loss': 9.770818901062011, 'learning_rate': 0.06373626373626373, 'optimizer_steps': 726}


Train Iteration:  37%|███▋      | 731/2002 [03:15<04:43,  4.48it/s]

{'avg_chunks': 1.8, 'loss': 9.758880615234375, 'learning_rate': 0.06348651348651349, 'optimizer_steps': 731}


Train Iteration:  37%|███▋      | 736/2002 [03:16<04:50,  4.35it/s]

{'avg_chunks': 2.2, 'loss': 10.207368850708008, 'learning_rate': 0.06323676323676324, 'optimizer_steps': 736}


Train Iteration:  37%|███▋      | 741/2002 [03:18<06:32,  3.21it/s]

{'avg_chunks': 3.2, 'loss': 9.231380844116211, 'learning_rate': 0.062987012987013, 'optimizer_steps': 741}


Train Iteration:  37%|███▋      | 746/2002 [03:19<04:53,  4.28it/s]

{'avg_chunks': 2.2, 'loss': 9.878170013427734, 'learning_rate': 0.06273726273726274, 'optimizer_steps': 746}


Train Iteration:  38%|███▊      | 751/2002 [03:20<05:10,  4.03it/s]

{'avg_chunks': 2.6, 'loss': 8.867762851715089, 'learning_rate': 0.06248751248751249, 'optimizer_steps': 751}


Train Iteration:  38%|███▊      | 756/2002 [03:21<04:30,  4.60it/s]

{'avg_chunks': 2.0, 'loss': 9.829474258422852, 'learning_rate': 0.06223776223776224, 'optimizer_steps': 756}


Train Iteration:  38%|███▊      | 761/2002 [03:23<05:10,  4.00it/s]

{'avg_chunks': 2.6, 'loss': 9.065997505187989, 'learning_rate': 0.061988011988011994, 'optimizer_steps': 761}


Train Iteration:  38%|███▊      | 766/2002 [03:24<04:28,  4.60it/s]

{'avg_chunks': 2.0, 'loss': 9.417056274414062, 'learning_rate': 0.06173826173826174, 'optimizer_steps': 766}


Train Iteration:  39%|███▊      | 771/2002 [03:25<04:53,  4.20it/s]

{'avg_chunks': 2.4, 'loss': 9.132866859436035, 'learning_rate': 0.06148851148851149, 'optimizer_steps': 771}


Train Iteration:  39%|███▉      | 776/2002 [03:26<04:43,  4.32it/s]

{'avg_chunks': 2.4, 'loss': 9.080833244323731, 'learning_rate': 0.06123876123876124, 'optimizer_steps': 776}


Train Iteration:  39%|███▉      | 781/2002 [03:27<04:28,  4.55it/s]

{'avg_chunks': 2.2, 'loss': 10.297702026367187, 'learning_rate': 0.060989010989010994, 'optimizer_steps': 781}


Train Iteration:  39%|███▉      | 786/2002 [03:29<05:36,  3.61it/s]

{'avg_chunks': 2.6, 'loss': 10.039156723022462, 'learning_rate': 0.06073926073926074, 'optimizer_steps': 786}


Train Iteration:  40%|███▉      | 791/2002 [03:30<05:13,  3.86it/s]

{'avg_chunks': 2.4, 'loss': 9.524635696411133, 'learning_rate': 0.06048951048951049, 'optimizer_steps': 791}


Train Iteration:  40%|███▉      | 796/2002 [03:31<04:53,  4.11it/s]

{'avg_chunks': 2.2, 'loss': 9.71751365661621, 'learning_rate': 0.06023976023976024, 'optimizer_steps': 796}


Train Iteration:  40%|████      | 801/2002 [03:32<05:42,  3.51it/s]

{'avg_chunks': 2.6, 'loss': 9.042756080627441, 'learning_rate': 0.05999000999000999, 'optimizer_steps': 801}


Train Iteration:  40%|████      | 806/2002 [03:33<05:02,  3.95it/s]

{'avg_chunks': 2.4, 'loss': 9.655180358886719, 'learning_rate': 0.05974025974025974, 'optimizer_steps': 806}


Train Iteration:  41%|████      | 811/2002 [03:35<04:19,  4.58it/s]

{'avg_chunks': 2.0, 'loss': 9.71581859588623, 'learning_rate': 0.05949050949050949, 'optimizer_steps': 811}


Train Iteration:  41%|████      | 816/2002 [03:36<04:32,  4.35it/s]

{'avg_chunks': 2.2, 'loss': 9.16224250793457, 'learning_rate': 0.05924075924075925, 'optimizer_steps': 816}


Train Iteration:  41%|████      | 821/2002 [03:37<05:16,  3.73it/s]

{'avg_chunks': 3.0, 'loss': 9.467046356201172, 'learning_rate': 0.05899100899100899, 'optimizer_steps': 821}


Train Iteration:  41%|████▏     | 826/2002 [03:38<05:01,  3.90it/s]

{'avg_chunks': 2.6, 'loss': 9.6323673248291, 'learning_rate': 0.05874125874125874, 'optimizer_steps': 826}


Train Iteration:  42%|████▏     | 831/2002 [03:40<05:01,  3.89it/s]

{'avg_chunks': 2.4, 'loss': 9.914602470397949, 'learning_rate': 0.0584915084915085, 'optimizer_steps': 831}


Train Iteration:  42%|████▏     | 836/2002 [03:41<05:09,  3.77it/s]

{'avg_chunks': 2.6, 'loss': 9.668305587768554, 'learning_rate': 0.05824175824175825, 'optimizer_steps': 836}


Train Iteration:  42%|████▏     | 841/2002 [03:42<04:45,  4.06it/s]

{'avg_chunks': 2.2, 'loss': 9.07628574371338, 'learning_rate': 0.05799200799200799, 'optimizer_steps': 841}


Train Iteration:  42%|████▏     | 846/2002 [03:43<04:26,  4.34it/s]

{'avg_chunks': 2.4, 'loss': 9.36173038482666, 'learning_rate': 0.05774225774225775, 'optimizer_steps': 846}


Train Iteration:  43%|████▎     | 851/2002 [03:45<06:16,  3.05it/s]

{'avg_chunks': 3.4, 'loss': 9.241529846191407, 'learning_rate': 0.0574925074925075, 'optimizer_steps': 851}


Train Iteration:  43%|████▎     | 856/2002 [03:47<06:21,  3.01it/s]

{'avg_chunks': 3.4, 'loss': 8.672313976287843, 'learning_rate': 0.05724275724275725, 'optimizer_steps': 856}


Train Iteration:  43%|████▎     | 861/2002 [03:48<05:11,  3.66it/s]

{'avg_chunks': 2.6, 'loss': 9.165144538879394, 'learning_rate': 0.056993006993006995, 'optimizer_steps': 861}


Train Iteration:  43%|████▎     | 866/2002 [03:50<06:44,  2.81it/s]

{'avg_chunks': 3.4, 'loss': 9.213097381591798, 'learning_rate': 0.05674325674325675, 'optimizer_steps': 866}


Train Iteration:  44%|████▎     | 871/2002 [03:51<04:35,  4.11it/s]

{'avg_chunks': 2.2, 'loss': 8.591648864746094, 'learning_rate': 0.0564935064935065, 'optimizer_steps': 871}


Train Iteration:  44%|████▍     | 876/2002 [03:53<05:20,  3.51it/s]

{'avg_chunks': 3.2, 'loss': 9.165218925476074, 'learning_rate': 0.05624375624375625, 'optimizer_steps': 876}


Train Iteration:  44%|████▍     | 881/2002 [03:54<05:12,  3.59it/s]

{'avg_chunks': 2.8, 'loss': 9.04231243133545, 'learning_rate': 0.055994005994005995, 'optimizer_steps': 881}


Train Iteration:  44%|████▍     | 886/2002 [03:55<04:20,  4.28it/s]

{'avg_chunks': 2.2, 'loss': 8.951912117004394, 'learning_rate': 0.05574425574425575, 'optimizer_steps': 886}


Train Iteration:  45%|████▍     | 891/2002 [03:56<03:58,  4.65it/s]

{'avg_chunks': 2.0, 'loss': 9.340393447875977, 'learning_rate': 0.0554945054945055, 'optimizer_steps': 891}


Train Iteration:  45%|████▍     | 896/2002 [03:58<04:59,  3.69it/s]

{'avg_chunks': 2.8, 'loss': 9.006371688842773, 'learning_rate': 0.05524475524475524, 'optimizer_steps': 896}


Train Iteration:  45%|████▌     | 901/2002 [03:59<04:35,  4.00it/s]

{'avg_chunks': 2.4, 'loss': 9.551518058776855, 'learning_rate': 0.054995004995004995, 'optimizer_steps': 901}


Train Iteration:  45%|████▌     | 906/2002 [04:00<04:19,  4.22it/s]

{'avg_chunks': 2.2, 'loss': 9.305575752258301, 'learning_rate': 0.05474525474525475, 'optimizer_steps': 906}


Train Iteration:  46%|████▌     | 911/2002 [04:01<04:30,  4.03it/s]

{'avg_chunks': 2.4, 'loss': 9.08197193145752, 'learning_rate': 0.0544955044955045, 'optimizer_steps': 911}


Train Iteration:  46%|████▌     | 916/2002 [04:03<04:46,  3.79it/s]

{'avg_chunks': 2.6, 'loss': 9.91202907562256, 'learning_rate': 0.054245754245754244, 'optimizer_steps': 916}


Train Iteration:  46%|████▌     | 921/2002 [04:04<05:21,  3.36it/s]

{'avg_chunks': 2.8, 'loss': 9.066930675506592, 'learning_rate': 0.053996003996003995, 'optimizer_steps': 921}


Train Iteration:  46%|████▋     | 926/2002 [04:05<05:05,  3.53it/s]

{'avg_chunks': 2.8, 'loss': 9.114797878265382, 'learning_rate': 0.05374625374625375, 'optimizer_steps': 926}


Train Iteration:  47%|████▋     | 932/2002 [04:07<04:16,  4.18it/s]

{'avg_chunks': 2.6, 'loss': 8.898641014099121, 'learning_rate': 0.053496503496503506, 'optimizer_steps': 931}


Train Iteration:  47%|████▋     | 936/2002 [04:08<04:47,  3.71it/s]

{'avg_chunks': 2.6, 'loss': 8.99415988922119, 'learning_rate': 0.053246753246753244, 'optimizer_steps': 936}


Train Iteration:  47%|████▋     | 941/2002 [04:09<04:09,  4.26it/s]

{'avg_chunks': 2.2, 'loss': 9.152545356750489, 'learning_rate': 0.052997002997002995, 'optimizer_steps': 941}


Train Iteration:  47%|████▋     | 946/2002 [04:10<04:14,  4.15it/s]

{'avg_chunks': 2.4, 'loss': 9.141690444946288, 'learning_rate': 0.052747252747252754, 'optimizer_steps': 946}


Train Iteration:  48%|████▊     | 951/2002 [04:12<04:53,  3.58it/s]

{'avg_chunks': 3.0, 'loss': 9.36220417022705, 'learning_rate': 0.052497502497502506, 'optimizer_steps': 951}


Train Iteration:  48%|████▊     | 956/2002 [04:13<04:22,  3.98it/s]

{'avg_chunks': 2.4, 'loss': 9.882191276550293, 'learning_rate': 0.052247752247752244, 'optimizer_steps': 956}


Train Iteration:  48%|████▊     | 961/2002 [04:15<04:50,  3.58it/s]

{'avg_chunks': 3.0, 'loss': 8.794253158569337, 'learning_rate': 0.051998001998002, 'optimizer_steps': 961}


Train Iteration:  48%|████▊     | 966/2002 [04:16<04:21,  3.96it/s]

{'avg_chunks': 2.4, 'loss': 8.655412769317627, 'learning_rate': 0.051748251748251754, 'optimizer_steps': 966}


Train Iteration:  49%|████▊     | 971/2002 [04:17<04:20,  3.96it/s]

{'avg_chunks': 2.6, 'loss': 8.602340793609619, 'learning_rate': 0.051498501498501506, 'optimizer_steps': 971}


Train Iteration:  49%|████▉     | 976/2002 [04:19<04:36,  3.72it/s]

{'avg_chunks': 2.6, 'loss': 9.442164993286132, 'learning_rate': 0.05124875124875125, 'optimizer_steps': 976}


Train Iteration:  49%|████▉     | 981/2002 [04:20<04:28,  3.80it/s]

{'avg_chunks': 2.6, 'loss': 9.637627792358398, 'learning_rate': 0.050999000999001, 'optimizer_steps': 981}


Train Iteration:  49%|████▉     | 986/2002 [04:21<04:03,  4.18it/s]

{'avg_chunks': 2.0, 'loss': 8.903835487365722, 'learning_rate': 0.050749250749250754, 'optimizer_steps': 986}


Train Iteration:  50%|████▉     | 991/2002 [04:22<04:28,  3.77it/s]

{'avg_chunks': 3.0, 'loss': 9.156488609313964, 'learning_rate': 0.050499500499500506, 'optimizer_steps': 991}


Train Iteration:  50%|████▉     | 996/2002 [04:24<05:24,  3.10it/s]

{'avg_chunks': 3.0, 'loss': 9.199807357788085, 'learning_rate': 0.05024975024975025, 'optimizer_steps': 996}


Train Iteration:  50%|█████     | 1001/2002 [04:25<04:25,  3.78it/s]

{'avg_chunks': 2.8, 'loss': 8.891701698303223, 'learning_rate': 0.05, 'optimizer_steps': 1001}


Train Iteration:  50%|█████     | 1006/2002 [04:27<05:34,  2.98it/s]

{'avg_chunks': 3.4, 'loss': 8.902868270874023, 'learning_rate': 0.04975024975024975, 'optimizer_steps': 1006}


Train Iteration:  50%|█████     | 1011/2002 [04:28<04:22,  3.77it/s]

{'avg_chunks': 2.4, 'loss': 8.952805995941162, 'learning_rate': 0.049500499500499506, 'optimizer_steps': 1011}


Train Iteration:  51%|█████     | 1016/2002 [04:29<03:36,  4.55it/s]

{'avg_chunks': 2.0, 'loss': 9.232447052001953, 'learning_rate': 0.04925074925074925, 'optimizer_steps': 1016}


Train Iteration:  51%|█████     | 1021/2002 [04:31<04:33,  3.59it/s]

{'avg_chunks': 2.8, 'loss': 9.154146194458008, 'learning_rate': 0.049000999000999, 'optimizer_steps': 1021}


Train Iteration:  51%|█████     | 1026/2002 [04:32<04:29,  3.62it/s]

{'avg_chunks': 2.6, 'loss': 9.270980262756348, 'learning_rate': 0.048751248751248755, 'optimizer_steps': 1026}


Train Iteration:  51%|█████▏    | 1031/2002 [04:34<04:51,  3.34it/s]

{'avg_chunks': 3.2, 'loss': 9.035742378234863, 'learning_rate': 0.048501498501498506, 'optimizer_steps': 1031}


Train Iteration:  52%|█████▏    | 1036/2002 [04:35<04:01,  4.00it/s]

{'avg_chunks': 2.4, 'loss': 9.352181816101075, 'learning_rate': 0.04825174825174825, 'optimizer_steps': 1036}


Train Iteration:  52%|█████▏    | 1041/2002 [04:36<04:35,  3.49it/s]

{'avg_chunks': 2.6, 'loss': 9.205619812011719, 'learning_rate': 0.048001998001998, 'optimizer_steps': 1041}


Train Iteration:  52%|█████▏    | 1047/2002 [04:38<03:43,  4.28it/s]

{'avg_chunks': 2.4, 'loss': 9.091753578186035, 'learning_rate': 0.047752247752247755, 'optimizer_steps': 1046}


Train Iteration:  52%|█████▏    | 1051/2002 [04:39<03:45,  4.22it/s]

{'avg_chunks': 2.2, 'loss': 8.673018074035644, 'learning_rate': 0.04750249750249751, 'optimizer_steps': 1051}


Train Iteration:  53%|█████▎    | 1056/2002 [04:40<03:28,  4.54it/s]

{'avg_chunks': 2.2, 'loss': 9.607015991210938, 'learning_rate': 0.04725274725274725, 'optimizer_steps': 1056}


Train Iteration:  53%|█████▎    | 1061/2002 [04:41<03:47,  4.14it/s]

{'avg_chunks': 2.4, 'loss': 8.935653495788575, 'learning_rate': 0.04700299700299701, 'optimizer_steps': 1061}


Train Iteration:  53%|█████▎    | 1066/2002 [04:42<03:22,  4.61it/s]

{'avg_chunks': 2.0, 'loss': 8.915045547485352, 'learning_rate': 0.046753246753246755, 'optimizer_steps': 1066}


Train Iteration:  53%|█████▎    | 1071/2002 [04:44<04:20,  3.58it/s]

{'avg_chunks': 3.0, 'loss': 8.641187858581542, 'learning_rate': 0.04650349650349651, 'optimizer_steps': 1071}


Train Iteration:  54%|█████▎    | 1076/2002 [04:45<04:10,  3.69it/s]

{'avg_chunks': 2.6, 'loss': 8.72311372756958, 'learning_rate': 0.04625374625374626, 'optimizer_steps': 1076}


Train Iteration:  54%|█████▍    | 1081/2002 [04:46<04:02,  3.80it/s]

{'avg_chunks': 2.8, 'loss': 8.684758377075195, 'learning_rate': 0.04600399600399601, 'optimizer_steps': 1081}


Train Iteration:  54%|█████▍    | 1086/2002 [04:47<03:35,  4.24it/s]

{'avg_chunks': 2.4, 'loss': 9.147875785827637, 'learning_rate': 0.045754245754245755, 'optimizer_steps': 1086}


Train Iteration:  54%|█████▍    | 1091/2002 [04:49<04:22,  3.47it/s]

{'avg_chunks': 2.8, 'loss': 8.404598808288574, 'learning_rate': 0.04550449550449551, 'optimizer_steps': 1091}


Train Iteration:  55%|█████▍    | 1096/2002 [04:50<03:30,  4.31it/s]

{'avg_chunks': 2.2, 'loss': 9.338799667358398, 'learning_rate': 0.04525474525474526, 'optimizer_steps': 1096}


Train Iteration:  55%|█████▍    | 1101/2002 [04:51<04:00,  3.75it/s]

{'avg_chunks': 2.8, 'loss': 8.821753120422363, 'learning_rate': 0.04500499500499501, 'optimizer_steps': 1101}


Train Iteration:  55%|█████▌    | 1106/2002 [04:53<04:37,  3.23it/s]

{'avg_chunks': 3.0, 'loss': 8.460673522949218, 'learning_rate': 0.044755244755244755, 'optimizer_steps': 1106}


Train Iteration:  56%|█████▌    | 1112/2002 [04:55<03:31,  4.20it/s]

{'avg_chunks': 2.8, 'loss': 9.223028373718261, 'learning_rate': 0.04450549450549451, 'optimizer_steps': 1111}


Train Iteration:  56%|█████▌    | 1116/2002 [04:56<03:35,  4.11it/s]

{'avg_chunks': 2.2, 'loss': 8.384595966339111, 'learning_rate': 0.04425574425574426, 'optimizer_steps': 1116}


Train Iteration:  56%|█████▌    | 1121/2002 [04:57<04:30,  3.25it/s]

{'avg_chunks': 2.8, 'loss': 8.424079895019531, 'learning_rate': 0.044005994005994004, 'optimizer_steps': 1121}


Train Iteration:  56%|█████▌    | 1126/2002 [04:58<03:47,  3.85it/s]

{'avg_chunks': 2.2, 'loss': 9.0718337059021, 'learning_rate': 0.04375624375624376, 'optimizer_steps': 1126}


Train Iteration:  56%|█████▋    | 1131/2002 [04:59<03:10,  4.57it/s]

{'avg_chunks': 2.0, 'loss': 8.823646736145019, 'learning_rate': 0.04350649350649351, 'optimizer_steps': 1131}


Train Iteration:  57%|█████▋    | 1136/2002 [05:01<03:30,  4.12it/s]

{'avg_chunks': 2.6, 'loss': 8.538280010223389, 'learning_rate': 0.04325674325674326, 'optimizer_steps': 1136}


Train Iteration:  57%|█████▋    | 1142/2002 [05:02<03:14,  4.42it/s]

{'avg_chunks': 2.8, 'loss': 8.698381423950195, 'learning_rate': 0.04300699300699301, 'optimizer_steps': 1141}


Train Iteration:  57%|█████▋    | 1146/2002 [05:03<03:40,  3.88it/s]

{'avg_chunks': 2.0, 'loss': 8.689157009124756, 'learning_rate': 0.04275724275724276, 'optimizer_steps': 1146}


Train Iteration:  57%|█████▋    | 1151/2002 [05:04<03:19,  4.26it/s]

{'avg_chunks': 2.2, 'loss': 8.402375602722168, 'learning_rate': 0.04250749250749251, 'optimizer_steps': 1151}


Train Iteration:  58%|█████▊    | 1156/2002 [05:06<03:13,  4.37it/s]

{'avg_chunks': 2.4, 'loss': 7.976721096038818, 'learning_rate': 0.04225774225774226, 'optimizer_steps': 1156}


Train Iteration:  58%|█████▊    | 1161/2002 [05:07<03:11,  4.40it/s]

{'avg_chunks': 2.0, 'loss': 8.900405120849609, 'learning_rate': 0.04200799200799201, 'optimizer_steps': 1161}


Train Iteration:  58%|█████▊    | 1166/2002 [05:08<03:32,  3.94it/s]

{'avg_chunks': 2.6, 'loss': 7.958880043029785, 'learning_rate': 0.04175824175824176, 'optimizer_steps': 1166}


Train Iteration:  58%|█████▊    | 1171/2002 [05:09<03:10,  4.35it/s]

{'avg_chunks': 2.0, 'loss': 8.716488933563232, 'learning_rate': 0.04150849150849151, 'optimizer_steps': 1171}


Train Iteration:  59%|█████▊    | 1176/2002 [05:10<03:33,  3.87it/s]

{'avg_chunks': 2.6, 'loss': 9.194646072387695, 'learning_rate': 0.041258741258741266, 'optimizer_steps': 1176}


Train Iteration:  59%|█████▉    | 1181/2002 [05:12<04:03,  3.37it/s]

{'avg_chunks': 3.6, 'loss': 8.349621200561524, 'learning_rate': 0.04100899100899101, 'optimizer_steps': 1181}


Train Iteration:  59%|█████▉    | 1186/2002 [05:13<02:51,  4.75it/s]

{'avg_chunks': 1.8, 'loss': 8.876216793060303, 'learning_rate': 0.04075924075924076, 'optimizer_steps': 1186}


Train Iteration:  59%|█████▉    | 1191/2002 [05:14<02:55,  4.61it/s]

{'avg_chunks': 2.2, 'loss': 8.634856796264648, 'learning_rate': 0.040509490509490514, 'optimizer_steps': 1191}


Train Iteration:  60%|█████▉    | 1196/2002 [05:16<03:35,  3.74it/s]

{'avg_chunks': 3.2, 'loss': 7.938406562805175, 'learning_rate': 0.040259740259740266, 'optimizer_steps': 1196}


Train Iteration:  60%|█████▉    | 1201/2002 [05:17<03:25,  3.89it/s]

{'avg_chunks': 2.6, 'loss': 8.396346187591552, 'learning_rate': 0.04000999000999001, 'optimizer_steps': 1201}


Train Iteration:  60%|██████    | 1206/2002 [05:19<03:54,  3.39it/s]

{'avg_chunks': 2.8, 'loss': 8.169697189331055, 'learning_rate': 0.03976023976023976, 'optimizer_steps': 1206}


Train Iteration:  60%|██████    | 1211/2002 [05:20<03:05,  4.27it/s]

{'avg_chunks': 2.2, 'loss': 8.80567741394043, 'learning_rate': 0.039510489510489515, 'optimizer_steps': 1211}


Train Iteration:  61%|██████    | 1216/2002 [05:21<03:08,  4.18it/s]

{'avg_chunks': 2.4, 'loss': 8.83835048675537, 'learning_rate': 0.03926073926073926, 'optimizer_steps': 1216}


Train Iteration:  61%|██████    | 1221/2002 [05:23<03:17,  3.95it/s]

{'avg_chunks': 2.6, 'loss': 8.748218536376953, 'learning_rate': 0.03901098901098901, 'optimizer_steps': 1221}


Train Iteration:  61%|██████    | 1226/2002 [05:24<03:04,  4.21it/s]

{'avg_chunks': 2.2, 'loss': 8.700663948059082, 'learning_rate': 0.03876123876123876, 'optimizer_steps': 1226}


Train Iteration:  61%|██████▏   | 1231/2002 [05:25<03:18,  3.88it/s]

{'avg_chunks': 2.8, 'loss': 7.922138118743897, 'learning_rate': 0.038511488511488515, 'optimizer_steps': 1231}


Train Iteration:  62%|██████▏   | 1236/2002 [05:27<03:05,  4.13it/s]

{'avg_chunks': 2.4, 'loss': 8.264333343505859, 'learning_rate': 0.03826173826173826, 'optimizer_steps': 1236}


Train Iteration:  62%|██████▏   | 1241/2002 [05:28<03:06,  4.07it/s]

{'avg_chunks': 2.4, 'loss': 8.944594860076904, 'learning_rate': 0.03801198801198802, 'optimizer_steps': 1241}


Train Iteration:  62%|██████▏   | 1246/2002 [05:29<02:57,  4.25it/s]

{'avg_chunks': 2.4, 'loss': 8.374634265899658, 'learning_rate': 0.03776223776223776, 'optimizer_steps': 1246}


Train Iteration:  62%|██████▏   | 1251/2002 [05:30<03:19,  3.77it/s]

{'avg_chunks': 2.6, 'loss': 8.484635543823241, 'learning_rate': 0.037512487512487515, 'optimizer_steps': 1251}


Train Iteration:  63%|██████▎   | 1256/2002 [05:31<03:01,  4.10it/s]

{'avg_chunks': 2.2, 'loss': 8.166665077209473, 'learning_rate': 0.03726273726273726, 'optimizer_steps': 1256}


Train Iteration:  63%|██████▎   | 1261/2002 [05:34<05:01,  2.46it/s]

{'avg_chunks': 4.0, 'loss': 8.890573120117187, 'learning_rate': 0.03701298701298702, 'optimizer_steps': 1261}


Train Iteration:  63%|██████▎   | 1266/2002 [05:36<03:15,  3.77it/s]

{'avg_chunks': 2.4, 'loss': 8.998627281188964, 'learning_rate': 0.03676323676323676, 'optimizer_steps': 1266}


Train Iteration:  63%|██████▎   | 1271/2002 [05:37<03:10,  3.83it/s]

{'avg_chunks': 2.8, 'loss': 8.18558177947998, 'learning_rate': 0.036513486513486515, 'optimizer_steps': 1271}


Train Iteration:  64%|██████▎   | 1276/2002 [05:39<03:38,  3.32it/s]

{'avg_chunks': 3.2, 'loss': 8.321831035614014, 'learning_rate': 0.03626373626373627, 'optimizer_steps': 1276}


Train Iteration:  64%|██████▍   | 1281/2002 [05:40<03:39,  3.28it/s]

{'avg_chunks': 2.8, 'loss': 8.52434949874878, 'learning_rate': 0.03601398601398602, 'optimizer_steps': 1281}


Train Iteration:  64%|██████▍   | 1286/2002 [05:41<02:57,  4.03it/s]

{'avg_chunks': 2.4, 'loss': 8.372791481018066, 'learning_rate': 0.03576423576423576, 'optimizer_steps': 1286}


Train Iteration:  64%|██████▍   | 1291/2002 [05:43<03:29,  3.40it/s]

{'avg_chunks': 3.2, 'loss': 8.62248125076294, 'learning_rate': 0.035514485514485515, 'optimizer_steps': 1291}


Train Iteration:  65%|██████▍   | 1296/2002 [05:44<03:16,  3.60it/s]

{'avg_chunks': 3.0, 'loss': 8.454168891906738, 'learning_rate': 0.03526473526473527, 'optimizer_steps': 1296}


Train Iteration:  65%|██████▍   | 1301/2002 [05:46<03:49,  3.06it/s]

{'avg_chunks': 2.8, 'loss': 8.541555786132813, 'learning_rate': 0.03501498501498502, 'optimizer_steps': 1301}


Train Iteration:  65%|██████▌   | 1306/2002 [05:47<02:57,  3.92it/s]

{'avg_chunks': 2.2, 'loss': 8.391809558868408, 'learning_rate': 0.034765234765234763, 'optimizer_steps': 1306}


Train Iteration:  65%|██████▌   | 1311/2002 [05:48<03:21,  3.43it/s]

{'avg_chunks': 2.8, 'loss': 8.358241176605224, 'learning_rate': 0.03451548451548452, 'optimizer_steps': 1311}


Train Iteration:  66%|██████▌   | 1316/2002 [05:50<02:57,  3.88it/s]

{'avg_chunks': 2.6, 'loss': 8.228112411499023, 'learning_rate': 0.03426573426573427, 'optimizer_steps': 1316}


Train Iteration:  66%|██████▌   | 1321/2002 [05:51<03:13,  3.51it/s]

{'avg_chunks': 2.8, 'loss': 8.638222980499268, 'learning_rate': 0.03401598401598401, 'optimizer_steps': 1321}


Train Iteration:  66%|██████▌   | 1326/2002 [05:53<03:02,  3.71it/s]

{'avg_chunks': 3.0, 'loss': 8.138371181488036, 'learning_rate': 0.03376623376623377, 'optimizer_steps': 1326}


Train Iteration:  66%|██████▋   | 1331/2002 [05:54<02:29,  4.48it/s]

{'avg_chunks': 2.0, 'loss': 9.050588226318359, 'learning_rate': 0.033516483516483515, 'optimizer_steps': 1331}


Train Iteration:  67%|██████▋   | 1336/2002 [05:55<02:52,  3.86it/s]

{'avg_chunks': 2.6, 'loss': 8.526128768920898, 'learning_rate': 0.03326673326673327, 'optimizer_steps': 1336}


Train Iteration:  67%|██████▋   | 1341/2002 [05:56<02:56,  3.75it/s]

{'avg_chunks': 2.6, 'loss': 8.49636344909668, 'learning_rate': 0.03301698301698302, 'optimizer_steps': 1341}


Train Iteration:  67%|██████▋   | 1346/2002 [05:58<02:59,  3.66it/s]

{'avg_chunks': 3.0, 'loss': 8.363408088684082, 'learning_rate': 0.03276723276723277, 'optimizer_steps': 1346}


Train Iteration:  67%|██████▋   | 1351/2002 [05:59<02:50,  3.82it/s]

{'avg_chunks': 2.4, 'loss': 8.868661308288575, 'learning_rate': 0.032517482517482516, 'optimizer_steps': 1351}


Train Iteration:  68%|██████▊   | 1356/2002 [06:00<02:37,  4.09it/s]

{'avg_chunks': 2.6, 'loss': 8.730056858062744, 'learning_rate': 0.03226773226773227, 'optimizer_steps': 1356}


Train Iteration:  68%|██████▊   | 1361/2002 [06:02<02:36,  4.09it/s]

{'avg_chunks': 2.2, 'loss': 8.73804006576538, 'learning_rate': 0.03201798201798202, 'optimizer_steps': 1361}


Train Iteration:  68%|██████▊   | 1366/2002 [06:03<02:26,  4.35it/s]

{'avg_chunks': 2.4, 'loss': 8.124489784240723, 'learning_rate': 0.03176823176823177, 'optimizer_steps': 1366}


Train Iteration:  68%|██████▊   | 1371/2002 [06:04<02:34,  4.09it/s]

{'avg_chunks': 2.4, 'loss': 8.582799339294434, 'learning_rate': 0.031518481518481516, 'optimizer_steps': 1371}


Train Iteration:  69%|██████▊   | 1376/2002 [06:05<02:29,  4.20it/s]

{'avg_chunks': 2.6, 'loss': 8.006237888336182, 'learning_rate': 0.031268731268731274, 'optimizer_steps': 1376}


Train Iteration:  69%|██████▉   | 1381/2002 [06:07<02:41,  3.85it/s]

{'avg_chunks': 2.6, 'loss': 8.682192420959472, 'learning_rate': 0.03101898101898102, 'optimizer_steps': 1381}


Train Iteration:  69%|██████▉   | 1387/2002 [06:08<02:09,  4.75it/s]

{'avg_chunks': 2.4, 'loss': 8.731008720397949, 'learning_rate': 0.03076923076923077, 'optimizer_steps': 1386}


Train Iteration:  69%|██████▉   | 1391/2002 [06:09<02:09,  4.71it/s]

{'avg_chunks': 1.8, 'loss': 8.191365814208984, 'learning_rate': 0.03051948051948052, 'optimizer_steps': 1391}


Train Iteration:  70%|██████▉   | 1396/2002 [06:10<02:29,  4.06it/s]

{'avg_chunks': 2.4, 'loss': 8.235101890563964, 'learning_rate': 0.030269730269730274, 'optimizer_steps': 1396}


Train Iteration:  70%|██████▉   | 1401/2002 [06:12<02:34,  3.89it/s]

{'avg_chunks': 2.8, 'loss': 8.321226119995117, 'learning_rate': 0.03001998001998002, 'optimizer_steps': 1401}


Train Iteration:  70%|███████   | 1406/2002 [06:13<02:28,  4.01it/s]

{'avg_chunks': 2.4, 'loss': 8.139438819885253, 'learning_rate': 0.029770229770229775, 'optimizer_steps': 1406}


Train Iteration:  70%|███████   | 1411/2002 [06:14<02:06,  4.69it/s]

{'avg_chunks': 2.0, 'loss': 8.292344665527343, 'learning_rate': 0.029520479520479523, 'optimizer_steps': 1411}


Train Iteration:  71%|███████   | 1416/2002 [06:15<02:07,  4.61it/s]

{'avg_chunks': 2.2, 'loss': 8.371468830108643, 'learning_rate': 0.029270729270729275, 'optimizer_steps': 1416}


Train Iteration:  71%|███████   | 1421/2002 [06:16<02:18,  4.18it/s]

{'avg_chunks': 2.4, 'loss': 8.225134372711182, 'learning_rate': 0.029020979020979023, 'optimizer_steps': 1421}


Train Iteration:  71%|███████   | 1426/2002 [06:18<02:11,  4.39it/s]

{'avg_chunks': 2.2, 'loss': 8.0686541557312, 'learning_rate': 0.028771228771228775, 'optimizer_steps': 1426}


Train Iteration:  71%|███████▏  | 1431/2002 [06:19<02:28,  3.85it/s]

{'avg_chunks': 2.8, 'loss': 8.721430492401122, 'learning_rate': 0.028521478521478523, 'optimizer_steps': 1431}


Train Iteration:  72%|███████▏  | 1436/2002 [06:20<02:33,  3.69it/s]

{'avg_chunks': 2.6, 'loss': 8.004640197753906, 'learning_rate': 0.02827172827172827, 'optimizer_steps': 1436}


Train Iteration:  72%|███████▏  | 1441/2002 [06:21<02:08,  4.36it/s]

{'avg_chunks': 2.2, 'loss': 8.209486961364746, 'learning_rate': 0.028021978021978023, 'optimizer_steps': 1441}


Train Iteration:  72%|███████▏  | 1447/2002 [06:23<02:06,  4.39it/s]

{'avg_chunks': 2.8, 'loss': 8.121831798553467, 'learning_rate': 0.02777222777222777, 'optimizer_steps': 1446}


Train Iteration:  72%|███████▏  | 1451/2002 [06:24<02:15,  4.07it/s]

{'avg_chunks': 2.2, 'loss': 7.739742374420166, 'learning_rate': 0.027522477522477523, 'optimizer_steps': 1451}


Train Iteration:  73%|███████▎  | 1456/2002 [06:25<02:05,  4.33it/s]

{'avg_chunks': 2.2, 'loss': 8.002476692199707, 'learning_rate': 0.02727272727272727, 'optimizer_steps': 1456}


Train Iteration:  73%|███████▎  | 1461/2002 [06:26<01:56,  4.64it/s]

{'avg_chunks': 2.0, 'loss': 8.65875015258789, 'learning_rate': 0.027022977022977027, 'optimizer_steps': 1461}


Train Iteration:  73%|███████▎  | 1466/2002 [06:27<01:57,  4.56it/s]

{'avg_chunks': 2.2, 'loss': 8.243315315246582, 'learning_rate': 0.02677322677322677, 'optimizer_steps': 1466}


Train Iteration:  73%|███████▎  | 1471/2002 [06:29<02:03,  4.31it/s]

{'avg_chunks': 2.4, 'loss': 8.290288734436036, 'learning_rate': 0.026523476523476527, 'optimizer_steps': 1471}


Train Iteration:  74%|███████▎  | 1476/2002 [06:30<02:06,  4.17it/s]

{'avg_chunks': 2.4, 'loss': 7.472724723815918, 'learning_rate': 0.026273726273726275, 'optimizer_steps': 1476}


Train Iteration:  74%|███████▍  | 1481/2002 [06:31<02:12,  3.94it/s]

{'avg_chunks': 2.4, 'loss': 7.695080757141113, 'learning_rate': 0.026023976023976027, 'optimizer_steps': 1481}


Train Iteration:  74%|███████▍  | 1486/2002 [06:32<02:19,  3.70it/s]

{'avg_chunks': 2.8, 'loss': 8.122010707855225, 'learning_rate': 0.025774225774225775, 'optimizer_steps': 1486}


Train Iteration:  74%|███████▍  | 1491/2002 [06:34<02:12,  3.84it/s]

{'avg_chunks': 2.6, 'loss': 7.701584529876709, 'learning_rate': 0.025524475524475527, 'optimizer_steps': 1491}


Train Iteration:  75%|███████▍  | 1496/2002 [06:35<02:07,  3.98it/s]

{'avg_chunks': 2.6, 'loss': 7.830912399291992, 'learning_rate': 0.025274725274725275, 'optimizer_steps': 1496}


Train Iteration:  75%|███████▍  | 1501/2002 [06:36<02:00,  4.15it/s]

{'avg_chunks': 2.4, 'loss': 8.165190029144288, 'learning_rate': 0.025024975024975027, 'optimizer_steps': 1501}


Train Iteration:  75%|███████▌  | 1506/2002 [06:38<02:13,  3.72it/s]

{'avg_chunks': 3.0, 'loss': 8.41130838394165, 'learning_rate': 0.024775224775224775, 'optimizer_steps': 1506}


Train Iteration:  75%|███████▌  | 1511/2002 [06:39<02:23,  3.41it/s]

{'avg_chunks': 2.8, 'loss': 8.476311302185058, 'learning_rate': 0.024525474525474527, 'optimizer_steps': 1511}


Train Iteration:  76%|███████▌  | 1517/2002 [06:41<01:46,  4.57it/s]

{'avg_chunks': 2.4, 'loss': 7.889234638214111, 'learning_rate': 0.024275724275724275, 'optimizer_steps': 1516}


Train Iteration:  76%|███████▌  | 1521/2002 [06:42<01:49,  4.38it/s]

{'avg_chunks': 2.0, 'loss': 8.16577081680298, 'learning_rate': 0.024025974025974027, 'optimizer_steps': 1521}


Train Iteration:  76%|███████▌  | 1526/2002 [06:43<02:14,  3.53it/s]

{'avg_chunks': 2.8, 'loss': 7.8025288581848145, 'learning_rate': 0.02377622377622378, 'optimizer_steps': 1526}


Train Iteration:  76%|███████▋  | 1531/2002 [06:44<02:07,  3.69it/s]

{'avg_chunks': 2.6, 'loss': 8.061491298675538, 'learning_rate': 0.023526473526473527, 'optimizer_steps': 1531}


Train Iteration:  77%|███████▋  | 1536/2002 [06:45<01:42,  4.53it/s]

{'avg_chunks': 2.0, 'loss': 7.969303894042969, 'learning_rate': 0.02327672327672328, 'optimizer_steps': 1536}


Train Iteration:  77%|███████▋  | 1541/2002 [06:47<01:51,  4.13it/s]

{'avg_chunks': 2.6, 'loss': 8.367321872711182, 'learning_rate': 0.023026973026973027, 'optimizer_steps': 1541}


Train Iteration:  77%|███████▋  | 1546/2002 [06:48<02:03,  3.70it/s]

{'avg_chunks': 2.8, 'loss': 7.468879318237304, 'learning_rate': 0.02277722277722278, 'optimizer_steps': 1546}


Train Iteration:  77%|███████▋  | 1551/2002 [06:49<02:08,  3.52it/s]

{'avg_chunks': 2.6, 'loss': 7.752832889556885, 'learning_rate': 0.02252747252747253, 'optimizer_steps': 1551}


Train Iteration:  78%|███████▊  | 1556/2002 [06:51<01:37,  4.56it/s]

{'avg_chunks': 2.2, 'loss': 8.241565227508545, 'learning_rate': 0.02227772227772228, 'optimizer_steps': 1556}


Train Iteration:  78%|███████▊  | 1561/2002 [06:52<01:41,  4.35it/s]

{'avg_chunks': 2.2, 'loss': 8.327011203765869, 'learning_rate': 0.02202797202797203, 'optimizer_steps': 1561}


Train Iteration:  78%|███████▊  | 1566/2002 [06:53<02:01,  3.60it/s]

{'avg_chunks': 2.8, 'loss': 8.106426429748534, 'learning_rate': 0.02177822177822178, 'optimizer_steps': 1566}


Train Iteration:  78%|███████▊  | 1571/2002 [06:54<01:31,  4.70it/s]

{'avg_chunks': 2.0, 'loss': 8.378828430175782, 'learning_rate': 0.02152847152847153, 'optimizer_steps': 1571}


Train Iteration:  79%|███████▉  | 1577/2002 [06:56<01:33,  4.53it/s]

{'avg_chunks': 2.4, 'loss': 7.871630382537842, 'learning_rate': 0.021278721278721283, 'optimizer_steps': 1576}


Train Iteration:  79%|███████▉  | 1581/2002 [06:57<01:36,  4.37it/s]

{'avg_chunks': 2.0, 'loss': 8.340756702423096, 'learning_rate': 0.02102897102897103, 'optimizer_steps': 1581}


Train Iteration:  79%|███████▉  | 1586/2002 [06:58<01:37,  4.27it/s]

{'avg_chunks': 2.4, 'loss': 8.06197214126587, 'learning_rate': 0.020779220779220783, 'optimizer_steps': 1586}


Train Iteration:  79%|███████▉  | 1591/2002 [06:59<01:31,  4.52it/s]

{'avg_chunks': 2.2, 'loss': 8.136255645751953, 'learning_rate': 0.02052947052947053, 'optimizer_steps': 1591}


Train Iteration:  80%|███████▉  | 1596/2002 [07:00<01:41,  4.00it/s]

{'avg_chunks': 2.6, 'loss': 8.038469982147216, 'learning_rate': 0.02027972027972028, 'optimizer_steps': 1596}


Train Iteration:  80%|███████▉  | 1601/2002 [07:02<01:43,  3.87it/s]

{'avg_chunks': 2.6, 'loss': 7.866788291931153, 'learning_rate': 0.02002997002997003, 'optimizer_steps': 1601}


Train Iteration:  80%|████████  | 1607/2002 [07:03<01:29,  4.40it/s]

{'avg_chunks': 2.4, 'loss': 8.30533504486084, 'learning_rate': 0.01978021978021978, 'optimizer_steps': 1606}


Train Iteration:  80%|████████  | 1611/2002 [07:04<01:30,  4.34it/s]

{'avg_chunks': 1.8, 'loss': 8.484454536437989, 'learning_rate': 0.01953046953046953, 'optimizer_steps': 1611}


Train Iteration:  81%|████████  | 1616/2002 [07:05<01:43,  3.74it/s]

{'avg_chunks': 2.8, 'loss': 8.077779293060303, 'learning_rate': 0.019280719280719283, 'optimizer_steps': 1616}


Train Iteration:  81%|████████  | 1621/2002 [07:07<01:48,  3.50it/s]

{'avg_chunks': 3.0, 'loss': 7.87253360748291, 'learning_rate': 0.01903096903096903, 'optimizer_steps': 1621}


Train Iteration:  81%|████████  | 1626/2002 [07:08<01:31,  4.11it/s]

{'avg_chunks': 2.4, 'loss': 8.178200244903564, 'learning_rate': 0.018781218781218783, 'optimizer_steps': 1626}


Train Iteration:  81%|████████▏ | 1631/2002 [07:10<02:07,  2.91it/s]

{'avg_chunks': 3.6, 'loss': 7.778977108001709, 'learning_rate': 0.01853146853146853, 'optimizer_steps': 1631}


Train Iteration:  82%|████████▏ | 1636/2002 [07:11<01:25,  4.29it/s]

{'avg_chunks': 2.0, 'loss': 7.95693359375, 'learning_rate': 0.018281718281718283, 'optimizer_steps': 1636}


Train Iteration:  82%|████████▏ | 1641/2002 [07:12<01:17,  4.66it/s]

{'avg_chunks': 2.0, 'loss': 8.340839385986328, 'learning_rate': 0.01803196803196803, 'optimizer_steps': 1641}


Train Iteration:  82%|████████▏ | 1646/2002 [07:13<01:18,  4.56it/s]

{'avg_chunks': 2.2, 'loss': 8.023741054534913, 'learning_rate': 0.017782217782217783, 'optimizer_steps': 1646}


Train Iteration:  82%|████████▏ | 1651/2002 [07:14<01:27,  4.00it/s]

{'avg_chunks': 2.6, 'loss': 7.957818698883057, 'learning_rate': 0.017532467532467535, 'optimizer_steps': 1651}


Train Iteration:  83%|████████▎ | 1656/2002 [07:16<01:58,  2.93it/s]

{'avg_chunks': 3.0, 'loss': 8.206096267700195, 'learning_rate': 0.017282717282717283, 'optimizer_steps': 1656}


Train Iteration:  83%|████████▎ | 1661/2002 [07:17<01:23,  4.07it/s]

{'avg_chunks': 2.2, 'loss': 7.107600116729737, 'learning_rate': 0.017032967032967035, 'optimizer_steps': 1661}


Train Iteration:  83%|████████▎ | 1666/2002 [07:20<02:56,  1.90it/s]

{'avg_chunks': 5.0, 'loss': 7.83282470703125, 'learning_rate': 0.016783216783216783, 'optimizer_steps': 1666}


Train Iteration:  84%|████████▎ | 1672/2002 [07:21<01:28,  3.72it/s]

{'avg_chunks': 2.6, 'loss': 7.378256511688233, 'learning_rate': 0.016533466533466535, 'optimizer_steps': 1671}


Train Iteration:  84%|████████▎ | 1676/2002 [07:22<01:17,  4.23it/s]

{'avg_chunks': 2.0, 'loss': 8.514235687255859, 'learning_rate': 0.016283716283716287, 'optimizer_steps': 1676}


Train Iteration:  84%|████████▍ | 1681/2002 [07:23<01:09,  4.65it/s]

{'avg_chunks': 2.0, 'loss': 8.183285522460938, 'learning_rate': 0.016033966033966035, 'optimizer_steps': 1681}


Train Iteration:  84%|████████▍ | 1686/2002 [07:25<01:28,  3.58it/s]

{'avg_chunks': 2.8, 'loss': 7.643351745605469, 'learning_rate': 0.015784215784215787, 'optimizer_steps': 1686}


Train Iteration:  84%|████████▍ | 1691/2002 [07:26<01:54,  2.71it/s]

{'avg_chunks': 3.0, 'loss': 7.957393550872803, 'learning_rate': 0.015534465534465537, 'optimizer_steps': 1691}


Train Iteration:  85%|████████▍ | 1697/2002 [07:28<01:13,  4.17it/s]

{'avg_chunks': 2.6, 'loss': 8.017033290863036, 'learning_rate': 0.015284715284715283, 'optimizer_steps': 1696}


Train Iteration:  85%|████████▍ | 1701/2002 [07:29<01:17,  3.87it/s]

{'avg_chunks': 2.2, 'loss': 7.602390480041504, 'learning_rate': 0.015034965034965035, 'optimizer_steps': 1701}


Train Iteration:  85%|████████▌ | 1706/2002 [07:30<01:12,  4.10it/s]

{'avg_chunks': 2.6, 'loss': 7.912553596496582, 'learning_rate': 0.014785214785214785, 'optimizer_steps': 1706}


Train Iteration:  85%|████████▌ | 1711/2002 [07:31<01:06,  4.39it/s]

{'avg_chunks': 2.2, 'loss': 7.989857959747314, 'learning_rate': 0.014535464535464535, 'optimizer_steps': 1711}


Train Iteration:  86%|████████▌ | 1717/2002 [07:33<00:59,  4.82it/s]

{'avg_chunks': 2.2, 'loss': 7.839084529876709, 'learning_rate': 0.014285714285714285, 'optimizer_steps': 1716}


Train Iteration:  86%|████████▌ | 1721/2002 [07:34<01:11,  3.95it/s]

{'avg_chunks': 2.2, 'loss': 7.358389472961425, 'learning_rate': 0.014035964035964035, 'optimizer_steps': 1721}


Train Iteration:  86%|████████▌ | 1726/2002 [07:35<00:58,  4.70it/s]

{'avg_chunks': 2.0, 'loss': 7.89564962387085, 'learning_rate': 0.013786213786213787, 'optimizer_steps': 1726}


Train Iteration:  86%|████████▋ | 1731/2002 [07:36<01:11,  3.80it/s]

{'avg_chunks': 2.4, 'loss': 8.119508361816406, 'learning_rate': 0.013536463536463537, 'optimizer_steps': 1731}


Train Iteration:  87%|████████▋ | 1736/2002 [07:37<01:08,  3.90it/s]

{'avg_chunks': 2.4, 'loss': 7.886468601226807, 'learning_rate': 0.013286713286713287, 'optimizer_steps': 1736}


Train Iteration:  87%|████████▋ | 1741/2002 [07:38<01:05,  3.97it/s]

{'avg_chunks': 2.6, 'loss': 7.734164237976074, 'learning_rate': 0.013036963036963037, 'optimizer_steps': 1741}


Train Iteration:  87%|████████▋ | 1746/2002 [07:40<01:00,  4.26it/s]

{'avg_chunks': 2.2, 'loss': 7.331593227386475, 'learning_rate': 0.012787212787212787, 'optimizer_steps': 1746}


Train Iteration:  87%|████████▋ | 1751/2002 [07:41<01:15,  3.34it/s]

{'avg_chunks': 3.0, 'loss': 7.742281913757324, 'learning_rate': 0.012537462537462539, 'optimizer_steps': 1751}


Train Iteration:  88%|████████▊ | 1757/2002 [07:43<01:10,  3.48it/s]

{'avg_chunks': 3.6, 'loss': 7.47286901473999, 'learning_rate': 0.012287712287712289, 'optimizer_steps': 1756}


Train Iteration:  88%|████████▊ | 1762/2002 [07:44<00:52,  4.54it/s]

{'avg_chunks': 2.0, 'loss': 7.631674766540527, 'learning_rate': 0.012037962037962039, 'optimizer_steps': 1761}


Train Iteration:  88%|████████▊ | 1766/2002 [07:45<00:52,  4.48it/s]

{'avg_chunks': 2.0, 'loss': 7.540378665924072, 'learning_rate': 0.011788211788211789, 'optimizer_steps': 1766}


Train Iteration:  88%|████████▊ | 1771/2002 [07:47<01:09,  3.30it/s]

{'avg_chunks': 3.0, 'loss': 7.767990207672119, 'learning_rate': 0.011538461538461539, 'optimizer_steps': 1771}


Train Iteration:  89%|████████▊ | 1776/2002 [07:48<00:57,  3.90it/s]

{'avg_chunks': 2.4, 'loss': 7.472438621520996, 'learning_rate': 0.01128871128871129, 'optimizer_steps': 1776}


Train Iteration:  89%|████████▉ | 1781/2002 [07:49<00:57,  3.87it/s]

{'avg_chunks': 2.4, 'loss': 7.620839691162109, 'learning_rate': 0.01103896103896104, 'optimizer_steps': 1781}


Train Iteration:  89%|████████▉ | 1787/2002 [07:50<00:39,  5.43it/s]

{'avg_chunks': 1.6, 'loss': 7.78159008026123, 'learning_rate': 0.01078921078921079, 'optimizer_steps': 1786}


Train Iteration:  89%|████████▉ | 1791/2002 [07:51<00:50,  4.17it/s]

{'avg_chunks': 2.4, 'loss': 7.87949504852295, 'learning_rate': 0.01053946053946054, 'optimizer_steps': 1791}


Train Iteration:  90%|████████▉ | 1797/2002 [07:53<00:52,  3.88it/s]

{'avg_chunks': 3.4, 'loss': 7.1341523170471195, 'learning_rate': 0.010289710289710291, 'optimizer_steps': 1796}


Train Iteration:  90%|████████▉ | 1801/2002 [07:54<00:50,  4.01it/s]

{'avg_chunks': 2.2, 'loss': 7.636790657043457, 'learning_rate': 0.010039960039960041, 'optimizer_steps': 1801}


Train Iteration:  90%|█████████ | 1806/2002 [07:56<00:51,  3.79it/s]

{'avg_chunks': 2.8, 'loss': 7.403612518310547, 'learning_rate': 0.009790209790209791, 'optimizer_steps': 1806}


Train Iteration:  90%|█████████ | 1811/2002 [07:57<01:03,  2.99it/s]

{'avg_chunks': 3.4, 'loss': 7.7509862899780275, 'learning_rate': 0.009540459540459541, 'optimizer_steps': 1811}


Train Iteration:  91%|█████████ | 1816/2002 [07:59<01:00,  3.05it/s]

{'avg_chunks': 3.4, 'loss': 7.246688270568848, 'learning_rate': 0.009290709290709291, 'optimizer_steps': 1816}


Train Iteration:  91%|█████████ | 1821/2002 [08:00<00:44,  4.05it/s]

{'avg_chunks': 2.2, 'loss': 7.178117561340332, 'learning_rate': 0.009040959040959041, 'optimizer_steps': 1821}


Train Iteration:  91%|█████████ | 1826/2002 [08:01<00:37,  4.66it/s]

{'avg_chunks': 2.0, 'loss': 7.798509216308593, 'learning_rate': 0.008791208791208793, 'optimizer_steps': 1826}


Train Iteration:  91%|█████████▏| 1831/2002 [08:03<00:45,  3.79it/s]

{'avg_chunks': 2.4, 'loss': 7.8615764617919925, 'learning_rate': 0.008541458541458541, 'optimizer_steps': 1831}


Train Iteration:  92%|█████████▏| 1837/2002 [08:04<00:35,  4.64it/s]

{'avg_chunks': 2.4, 'loss': 7.445543003082276, 'learning_rate': 0.008291708291708291, 'optimizer_steps': 1836}


Train Iteration:  92%|█████████▏| 1841/2002 [08:05<00:30,  5.20it/s]

{'avg_chunks': 1.6, 'loss': 7.37417516708374, 'learning_rate': 0.008041958041958041, 'optimizer_steps': 1841}


Train Iteration:  92%|█████████▏| 1846/2002 [08:06<00:36,  4.29it/s]

{'avg_chunks': 2.2, 'loss': 7.578171920776367, 'learning_rate': 0.007792207792207792, 'optimizer_steps': 1846}


Train Iteration:  92%|█████████▏| 1851/2002 [08:07<00:36,  4.16it/s]

{'avg_chunks': 2.2, 'loss': 7.531660747528076, 'learning_rate': 0.007542457542457543, 'optimizer_steps': 1851}


Train Iteration:  93%|█████████▎| 1857/2002 [08:08<00:32,  4.42it/s]

{'avg_chunks': 2.4, 'loss': 7.550947952270508, 'learning_rate': 0.007292707292707293, 'optimizer_steps': 1856}


Train Iteration:  93%|█████████▎| 1861/2002 [08:09<00:35,  3.97it/s]

{'avg_chunks': 2.2, 'loss': 7.470280456542969, 'learning_rate': 0.007042957042957043, 'optimizer_steps': 1861}


Train Iteration:  93%|█████████▎| 1866/2002 [08:11<00:35,  3.80it/s]

{'avg_chunks': 2.6, 'loss': 6.975150203704834, 'learning_rate': 0.006793206793206794, 'optimizer_steps': 1866}


Train Iteration:  93%|█████████▎| 1871/2002 [08:12<00:26,  4.89it/s]

{'avg_chunks': 1.6, 'loss': 7.629854202270508, 'learning_rate': 0.006543456543456544, 'optimizer_steps': 1871}


Train Iteration:  94%|█████████▎| 1876/2002 [08:13<00:29,  4.26it/s]

{'avg_chunks': 2.6, 'loss': 7.504455661773681, 'learning_rate': 0.006293706293706295, 'optimizer_steps': 1876}


Train Iteration:  94%|█████████▍| 1881/2002 [08:14<00:28,  4.22it/s]

{'avg_chunks': 2.4, 'loss': 7.539145755767822, 'learning_rate': 0.006043956043956044, 'optimizer_steps': 1881}


Train Iteration:  94%|█████████▍| 1886/2002 [08:16<00:32,  3.56it/s]

{'avg_chunks': 3.2, 'loss': 7.463865852355957, 'learning_rate': 0.005794205794205795, 'optimizer_steps': 1886}


Train Iteration:  94%|█████████▍| 1891/2002 [08:17<00:33,  3.36it/s]

{'avg_chunks': 2.8, 'loss': 7.295683479309082, 'learning_rate': 0.005544455544455545, 'optimizer_steps': 1891}


Train Iteration:  95%|█████████▍| 1896/2002 [08:18<00:24,  4.40it/s]

{'avg_chunks': 2.0, 'loss': 7.722306442260742, 'learning_rate': 0.005294705294705295, 'optimizer_steps': 1896}


Train Iteration:  95%|█████████▌| 1902/2002 [08:20<00:22,  4.36it/s]

{'avg_chunks': 2.8, 'loss': 7.683979892730713, 'learning_rate': 0.005044955044955045, 'optimizer_steps': 1901}


Train Iteration:  95%|█████████▌| 1906/2002 [08:21<00:28,  3.36it/s]

{'avg_chunks': 2.2, 'loss': 7.230949974060058, 'learning_rate': 0.004795204795204795, 'optimizer_steps': 1906}


Train Iteration:  95%|█████████▌| 1911/2002 [08:22<00:24,  3.74it/s]

{'avg_chunks': 2.8, 'loss': 7.770898342132568, 'learning_rate': 0.004545454545454546, 'optimizer_steps': 1911}


Train Iteration:  96%|█████████▌| 1916/2002 [08:24<00:23,  3.65it/s]

{'avg_chunks': 2.4, 'loss': 7.132836055755615, 'learning_rate': 0.004295704295704296, 'optimizer_steps': 1916}


Train Iteration:  96%|█████████▌| 1921/2002 [08:25<00:18,  4.33it/s]

{'avg_chunks': 2.2, 'loss': 7.625317573547363, 'learning_rate': 0.004045954045954046, 'optimizer_steps': 1921}


Train Iteration:  96%|█████████▌| 1926/2002 [08:26<00:17,  4.26it/s]

{'avg_chunks': 2.4, 'loss': 7.080447578430176, 'learning_rate': 0.003796203796203796, 'optimizer_steps': 1926}


Train Iteration:  96%|█████████▋| 1931/2002 [08:27<00:18,  3.93it/s]

{'avg_chunks': 2.8, 'loss': 7.267561626434326, 'learning_rate': 0.0035464535464535467, 'optimizer_steps': 1931}


Train Iteration:  97%|█████████▋| 1936/2002 [08:29<00:15,  4.35it/s]

{'avg_chunks': 2.2, 'loss': 7.511083602905273, 'learning_rate': 0.003296703296703297, 'optimizer_steps': 1936}


Train Iteration:  97%|█████████▋| 1941/2002 [08:30<00:14,  4.18it/s]

{'avg_chunks': 2.4, 'loss': 7.431461238861084, 'learning_rate': 0.003046953046953047, 'optimizer_steps': 1941}


Train Iteration:  97%|█████████▋| 1946/2002 [08:31<00:16,  3.30it/s]

{'avg_chunks': 3.0, 'loss': 7.262905693054199, 'learning_rate': 0.002797202797202797, 'optimizer_steps': 1946}


Train Iteration:  97%|█████████▋| 1951/2002 [08:32<00:12,  3.98it/s]

{'avg_chunks': 2.2, 'loss': 7.3323211669921875, 'learning_rate': 0.0025474525474525477, 'optimizer_steps': 1951}


Train Iteration:  98%|█████████▊| 1956/2002 [08:34<00:10,  4.48it/s]

{'avg_chunks': 2.0, 'loss': 7.638689613342285, 'learning_rate': 0.0022977022977022977, 'optimizer_steps': 1956}


Train Iteration:  98%|█████████▊| 1961/2002 [08:35<00:10,  4.07it/s]

{'avg_chunks': 2.8, 'loss': 7.005845642089843, 'learning_rate': 0.002047952047952048, 'optimizer_steps': 1961}


Train Iteration:  98%|█████████▊| 1966/2002 [08:36<00:09,  3.73it/s]

{'avg_chunks': 2.8, 'loss': 7.704857444763183, 'learning_rate': 0.0017982017982017984, 'optimizer_steps': 1966}


Train Iteration:  98%|█████████▊| 1971/2002 [08:38<00:08,  3.68it/s]

{'avg_chunks': 2.6, 'loss': 7.036925315856934, 'learning_rate': 0.0015484515484515485, 'optimizer_steps': 1971}


Train Iteration:  99%|█████████▊| 1976/2002 [08:39<00:05,  4.92it/s]

{'avg_chunks': 1.8, 'loss': 7.111443901062012, 'learning_rate': 0.001298701298701299, 'optimizer_steps': 1976}


Train Iteration:  99%|█████████▉| 1982/2002 [08:40<00:03,  5.11it/s]

{'avg_chunks': 2.2, 'loss': 7.341419124603272, 'learning_rate': 0.001048951048951049, 'optimizer_steps': 1981}


Train Iteration:  99%|█████████▉| 1986/2002 [08:44<00:09,  1.73it/s]

{'avg_chunks': 5.0, 'loss': 7.663602447509765, 'learning_rate': 0.0007992007992007992, 'optimizer_steps': 1986}


Train Iteration:  99%|█████████▉| 1991/2002 [08:46<00:04,  2.29it/s]

{'avg_chunks': 2.8, 'loss': 7.306700897216797, 'learning_rate': 0.0005494505494505496, 'optimizer_steps': 1991}


Train Iteration: 100%|█████████▉| 1996/2002 [08:47<00:01,  3.59it/s]

{'avg_chunks': 2.4, 'loss': 7.309193229675293, 'learning_rate': 0.0002997002997002997, 'optimizer_steps': 1996}


Train Iteration: 100%|█████████▉| 2000/2002 [08:48<00:00,  3.61it/s]

{'avg_chunks': 2.6, 'loss': 7.354541969299317, 'learning_rate': 4.995004995004995e-05, 'optimizer_steps': 2001}



Val Iteration:   0%|          | 0/222 [00:00<?, ?it/s][A
Val Iteration:   0%|          | 1/222 [00:00<00:31,  7.13it/s][A

Save completed. xlnet_trainer_checkpoints/chkpt_2000



Val Iteration:   5%|▍         | 11/222 [00:00<00:22,  9.52it/s][A
Val Iteration:   9%|▉         | 21/222 [00:00<00:15, 12.77it/s][A
Val Iteration:  14%|█▍        | 31/222 [00:00<00:11, 16.77it/s][A
Val Iteration:  18%|█▊        | 40/222 [00:00<00:08, 22.06it/s][A
Val Iteration:  21%|██        | 46/222 [00:00<00:07, 23.89it/s][A
Val Iteration:  23%|██▎       | 51/222 [00:01<00:06, 24.56it/s][A
Val Iteration:  27%|██▋       | 59/222 [00:01<00:05, 30.97it/s][A
Val Iteration:  29%|██▉       | 65/222 [00:01<00:05, 27.20it/s][A
Val Iteration:  32%|███▏      | 71/222 [00:01<00:04, 30.37it/s][A
Val Iteration:  36%|███▋      | 81/222 [00:02<00:04, 31.33it/s][A
Val Iteration:  41%|████      | 91/222 [00:02<00:05, 24.45it/s][A
Val Iteration:  45%|████▌     | 100/222 [00:02<00:03, 31.00it/s][A
Val Iteration:  47%|████▋     | 105/222 [00:02<00:03, 29.73it/s][A
Val Iteration:  50%|█████     | 111/222 [00:03<00:03, 29.63it/s][A
Val Iteration:  54%|█████▎    | 119/222 [00:03<00:02, 36.3

Validation loss: 7.494535280310589





#Evaluation

In [12]:
with open(os.path.join("xlnet_trainer_checkpoints/chkpt_0/chunk_config.json"), "r") as chunk_cfg:
  chunk_config = json.load(chunk_cfg)
chunk_seq_len = chunk_config["max_seq_len"]
target_pred_max_len = chunk_config["predict_len"]

In [13]:
model.eval()
torch.set_grad_enabled(False)
results = {"meta": chunk_config, "results": []}
do_sampling = False

In [25]:
result = { "prompt": [], "generated": [], "actual": [] }

device = 'cuda'
for i in range(len(val_set)):
    chunked_data = val_set[random.randint(0, len(val_set) - 1)]
    full_text = []

    # Start by establishing mems state - achieved by doing forward passes on all chunks but the last one.
    mems = None
    num_chunks = len(chunked_data["input_ids"])
    for c in range(num_chunks - 1):
        full_text.extend(chunked_data["input_ids"][c].tolist())

        model_inputs = {
            "input_ids": chunked_data["input_ids"][c].unsqueeze(0).to(device),
            "attention_mask": chunked_data["attention_masks"][c]
            .unsqueeze(0)
            .to(device),
            "perm_mask": chunked_data["permutation_masks"][c].unsqueeze(0).to(device),
            "target_mapping": chunked_data["target_mapping"]
            .unsqueeze(0)
            .to(device),
        }
        if mems is not None:
            model_inputs["mems"] = mems

        logits, mems = model.forward(**model_inputs)

    # Now get the input IDs minus the target* for the last chunk. This will serve as the "prompt" for the model.
    text_len = chunk_seq_len - target_pred_max_len
    prompt_inputs = chunked_data["input_ids"][-1][0:text_len]
    prompt_inputs = prompt_inputs.to(device)
    full_text.extend(prompt_inputs)
    prompt_inputs = prompt_inputs.unsqueeze(dim=0)  # generate() expects batched inputs.

    # Use the transformers generate function to do the actual generation now.
    if do_sampling:
        genned_results = model.generate(
            prompt_inputs,
            max_length=150,
            min_length=40,
            do_sample=True,
            num_beams=4,
            temperature=0.7,
            top_k=0,
            top_p=0.9,
            repetition_penalty=5,
            eos_token_id=tok.eos_token_id,
            num_return_sequences=5,
            mems=mems,
        )
    else:
        genned_results = model.generate(
            prompt_inputs,
            max_length=400,
            min_length=300,
            do_sample=False,
            num_beams=12,
            repetition_penalty=3.0,
            eos_token_id=tok.eos_token_id,
            num_return_sequences=1,
            mems=None,
        )

    # Append results here.
    seqs, _ = genned_results.shape
    genned_texts = []
    prompt = tok.decode(prompt_inputs[0])
    print(
        "\n------------------------------------------------------------------------------"
    )
    print("PROMPT: `%s`" % (prompt))
    for s in range(seqs):
        genned_texts.append(tok.decode(genned_results[s][text_len:]))
        print("GENERATED: `%s`" % (genned_texts[-1]))
    result["prompt"].append(tok.decode(full_text))
    result["generated"].append(genned_texts)
    result["actual"].append(tok.decode(chunked_data["labels"]))
    print(
        "------------------------------------------------------------------------------"
    )


------------------------------------------------------------------------------
PROMPT: `programme your microwave from your TV, for example. This digital home of the future depends on the widespread adoption of the Cell processor and there are, as with all things, a number of reasons it could fail. Because the processor is so different, it requires programmers to learn a different way of writing software, and it may be that the changeover is simply too difficult for them to master. You can also guarantee that Microsoft and Intel are not going to sit around and let Cell take over home computing without a fight. Microsoft is going to be pushing its Xbox 2 as hard as possible to make sure that its technology, not Sony's, will be under your tree next Christmas. Intel will be furiously working on new designs that address the problems of its current chips to create a rival technology to Cell, so that it doesn't lose its desktop PC dominance. If Cell succeeds in becoming the living room techn

In [26]:
# Because the actual output (500~1000 words) is usually much longer than the
# predicted output, instead of how many n-grams are matched in the actual
# output, we compute the number of bigrams in the predicted output that also
# appear in the actual output.
#
# Params
# n: n-gram to compare.
def computeCustomizedRougeN(predicted_output: str, actual_output: str, n: int):
  predicted_words = predicted_output.split()
  actual_words = actual_output.split()
  assert n <= len(predicted_words) and n <= len(actual_words)
  predicted_ngrams = [''.join(predicted_words[i : i + n]) for i in range(len(predicted_words) - n + 1)]
  actual_ngrams = set([''.join(actual_words[i : i + n]) for i in range(len(actual_words) - n + 1)])
  num_matches = 0
  for predicted_ngram in predicted_ngrams:
    if predicted_ngram in actual_ngrams:
      num_matches += 1
  return num_matches / len(predicted_ngrams)

In [31]:
# result = { "prompt": [], "generated": [], "actual": [] }
for i in range(20):
  print(computeCustomizedRougeN(result['generated'][i][0], result['actual'][i], 1))

0.0
0.0
0.0
0.0
0.03389830508474576
0.03389830508474576
0.03508771929824561
0.0
0.0
0.0
0.034482758620689655
0.03571428571428571
0.0
0.0
0.01818181818181818
0.0
0.01694915254237288
0.0
0.0
0.0
