In [1]:
# Progress bar and batching utilities used by Trainer.train().
from tqdm import tqdm
from torch.utils.data.dataloader import DataLoader

# Colab-specific setup: mount Google Drive and make the project folder the
# working directory. This has to run before the local imports below, and it
# also fixes where relative paths such as 'state/model' resolve.
from google.colab import drive
import os
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/Colab Notebooks/NLP')

# Project-local modules, star-imported from the working directory set above.
# NOTE(review): later cells use `torch`, `nn`, `F` and `np` without importing
# them, so those names presumably leak in through these star imports --
# confirm, and prefer explicit imports so the dependency is visible.
from param_partitioner import *
from dataset import *
from models import *

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
class Trainer:
    """Small training harness around a model, an optimiser and a dataset.

    Attributes:
        model: the module being trained; wrapped in ``torch.nn.DataParallel``
            and moved to the current CUDA device when CUDA is available.
        optimiser: optimiser stepped once per training batch.
        dataset: training dataset. Must provide ``vocab_size``, ``s2i``,
            ``i2s``, ``selfsupervised`` and ``rnd_ts`` (all used below).
        hyp: hyper-parameter object; ``batch_size``, ``num_workers``,
            ``grad_norm_clip``, ``learning_rate``, ``lr_mult`` and
            ``max_epochs`` are read from it.
        test_dataset: optional evaluation dataset; when set it is evaluated
            after every training epoch.
        sample_rate: a checkpoint is written and a sample printed every
            ``sample_rate`` training iterations.
        device: ``'cpu'`` or the index of the current CUDA device.
    """

    # Where checkpoints are written by train() and read by load().
    checkpoint_path = 'state/model'

    def __init__(self, model, optimiser, dataset, hyp):
        self.model = model
        self.optimiser = optimiser
        self.dataset = dataset
        self.hyp = hyp
        self.test_dataset = None
        self.sample_rate = 100

        self.device = 'cpu'
        if torch.cuda.is_available():
            self.device = torch.cuda.current_device()
            self.model = torch.nn.DataParallel(self.model).to(self.device)

    def _raw_model(self):
        """Return the underlying module, unwrapping DataParallel if present."""
        if isinstance(self.model, torch.nn.DataParallel):
            return self.model.module
        return self.model

    def _save_checkpoint(self):
        """Write the unwrapped model's state dict to ``checkpoint_path``.

        The unwrapped module is saved so the keys carry no ``module.`` prefix
        and can be loaded into a bare model by ``load``.
        """
        folder = os.path.dirname(self.checkpoint_path)
        if folder:
            os.makedirs(folder, exist_ok=True)
        torch.save(self._raw_model().state_dict(), self.checkpoint_path)

    def load(self, model_object):
        """Load the saved checkpoint into ``model_object`` and adopt it.

        Args:
            model_object: a freshly constructed, unwrapped model whose
                parameters match the checkpoint.

        The weights are loaded on CPU first, so a checkpoint written on a GPU
        machine can be restored anywhere, and ``self.model`` is only replaced
        once loading has succeeded. The optimiser is not touched: it still
        references whatever parameters it was built with.
        """
        state = torch.load(self.checkpoint_path, map_location='cpu')
        model_object.load_state_dict(state)
        self.model = model_object
        if self.device != 'cpu':
            self.model = torch.nn.DataParallel(self.model)
        self.model = self.model.to(self.device)

    def sample(self, prompt='', temperature=1, max_new_tokens=100):
        """Autoregressively extend ``prompt`` one token at a time.

        Args:
            prompt: starting text; each element is encoded with
                ``dataset.s2i``. When empty, a single random token id seeds
                the model (that seed token is not part of the returned text).
            temperature: divisor applied to the last-position logits before
                the softmax.
            max_new_tokens: upper bound on the number of generated tokens.

        Returns:
            ``prompt`` followed by the generated text. Generation stops early
            once ``'_'`` is produced.

        The model is put in eval mode with autograd disabled while sampling
        and its previous train/eval mode is restored afterwards, so calling
        this in the middle of a training step neither samples through dropout
        nor builds an autograd graph.
        """
        if prompt == '':
            x = torch.randint(self.dataset.vocab_size, (1, 1))
        else:
            x = torch.tensor([[self.dataset.s2i(s) for s in prompt]], dtype=torch.long)
        x = x.to(self.device)

        was_training = self.model.training
        self.model.eval()
        try:
            with torch.no_grad():
                # NOTE(review): the context is never cropped, so it can grow to
                # len(prompt) + max_new_tokens -- confirm the model accepts that.
                for _ in range(max_new_tokens):
                    logits = self.model(x)
                    logits = logits[:, -1, :] / temperature
                    probs = F.softmax(logits, dim=-1)
                    w_ix = torch.multinomial(probs, num_samples=1)
                    x = torch.cat((x, w_ix), dim=1)
                    prompt += self.dataset.i2s(int(w_ix))
                    if prompt[-1] == '_':
                        break
        finally:
            self.model.train(was_training)
        return prompt

    def train(self):
        """Run ``hyp.max_epochs`` epochs of training.

        Each epoch trains on ``self.dataset`` and, when ``self.test_dataset``
        is set, evaluates on it. After epoch ``e`` (0-based) the learning rate
        of every parameter group is set to
        ``hyp.learning_rate * hyp.lr_mult ** (e + 1)``, so the first decay
        takes effect from the second epoch.
        """
        import logging

        hyp = self.hyp
        log = logging.getLogger(__name__)

        def run_epoch(split, epoch):
            is_train = split == 'train'
            self.model.train(is_train)
            data = self.dataset if is_train else self.test_dataset
            loader = DataLoader(data, shuffle=True,
                                pin_memory=(self.device != 'cpu'),
                                batch_size=hyp.batch_size,
                                num_workers=hyp.num_workers)
            losses = []
            pbar = tqdm(enumerate(loader), total=len(loader)) if is_train else enumerate(loader)
            for it, (x, y) in pbar:
                x = x.to(self.device)
                y = y.to(self.device)

                with torch.set_grad_enabled(is_train):
                    logits = self.model(x)
                    loss = F.cross_entropy(logits.view(-1, logits.size(-1)), y.view(-1))
                losses.append(loss.item())

                if is_train:
                    if it % self.sample_rate == 0 and self.dataset.selfsupervised:
                        self._save_checkpoint()
                        print( self.sample(self.dataset.rnd_ts()) )

                    self.model.zero_grad()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(), hyp.grad_norm_clip)
                    self.optimiser.step()

                    # Report the rate the optimiser is actually using.
                    lr = self.optimiser.param_groups[0]['lr']

                    pbar.set_description(f"epoch {epoch+1} iter {it}: train loss {loss.item():.5f}. lr {lr:e}")

            if not is_train:
                test_loss = sum(losses) / len(losses) if losses else float('nan')
                log.info("test loss: %f", test_loss)
                return test_loss

        self.tokens = 0 # kept for compatibility; not used by the schedule below
        for epoch in range(hyp.max_epochs):

            run_epoch('train', epoch)
            if self.test_dataset is not None:
                run_epoch('test', epoch)

            # Exponential decay applied to the epochs that follow this one.
            lr = hyp.learning_rate * (hyp.lr_mult ** (epoch + 1))
            for param_group in self.optimiser.param_groups:
                param_group['lr'] = lr


class Hyperparameters:
    """Plain namespace of run settings, exposed as class attributes.

    The batching, optimisation and schedule values are read by ``Trainer`` and
    by the optimiser set-up; the architecture values are handed to the model
    constructor together with the rest of this object.
    """

    # --- batching / data ---
    block_size = 128        # passed to the dataset constructor (was 50)
    batch_size = 32
    num_workers = 8         # DataLoader worker processes

    # --- architecture ---
    # NOTE(review): presumably consumed by the model class -- confirm there.
    h_dim = 256
    nhead = 4
    nlayers = 2
    dropout = 0.5
    max_T = block_size // 2

    # --- optimisation ---
    learning_rate = 0.003
    betas = (0.9, 0.95)     # forwarded to AdamW
    grad_norm_clip = 0.5    # max-norm handed to clip_grad_norm_

    # --- schedule ---
    lr_mult = 0.9           # per-epoch multiplicative learning-rate factor
    max_epochs = 1

In [3]:
# Build everything a Trainer needs: data, hyper-parameters, model, optimiser.
# Every name assigned here is a notebook global that later cells read.

# Size argument passed to the project's data generator.
n = int(10**5)
(tr_xs, tr_ys), (ts_xs, ts_ys) = generate_data(n, difficulty=3, extra=False)

# Wrap the train/test pairs in a character-level dataset.
# NOTE(review): the original notes say (x, y) are encoded as tensors with an
# end-of-stream token added -- presumably inside CharDataset; confirm there.
hyp = Hyperparameters()
dataset = CharDataset(tr_xs, tr_ys, ts_xs, ts_ys, hyp.block_size)

model = M3(dataset.vocab_size, hyp)

# initialise model parameters
# Each entry looks like (init_fn, positional_args, keyword_args), keyed by a
# layer class name -- TODO confirm against apply_initialisation.
weight_kwargs = {'Embedding':(nn.init.normal_, [], {'mean':0, 'std':0.02}),
                  'Linear':(nn.init.normal_, [], {'mean':0, 'std':0.02}),
                  'LayerNorm':(nn.init.constant_, [1], {})}
bias_kwargs =   {'Default':(nn.init.constant_, [0], {})}
apply_initialisation(model, (weight_kwargs, bias_kwargs))

# initialise optimiser
# Split the parameter names into groups. Judging by the printed partition,
# each [a, b] pair is a truth assignment over `conditions` (name contains
# 'weight', name contains 'pos_emb'): names containing 'weight' land in
# 'weights' and everything else, including 'pos_emb', in 'rest'.
conditions = ['weight', 'pos_emb']
combine = {'weights':[[1,1],[1,0]],
            'rest':[[0,0],[0,1]],
            'other': None}
partition = multiple_partition(model, conditions, comb=combine)

# Show which parameter names ended up in each group.
for k, v in partition.items(): print('\n'+k+'\n%s'%v)

# Per-group optimiser options: weight decay on 'weights' only.
typ_opt_kwargs = {'weights': {"weight_decay": 0.1}, 
                  'rest':   {"weight_decay": 0.0},
                  'other':   {}}
opt_groups = optimisation_groups(model, partition, typ_opt_kwargs)
optimiser = torch.optim.AdamW(opt_groups, lr=hyp.learning_rate, betas=hyp.betas) 

data has 51092 characters, 33 unique.
M3
Embedding
Sequential
Block
MultiHeadMaskedSelfAttention
Linear
Linear
Linear
Sequential
Linear
GELU
Linear
Dropout
Block
MultiHeadMaskedSelfAttention
Linear
Linear
Linear
Sequential
Linear
GELU
Linear
Dropout
Linear
LayerNorm

weights
['blocks.1.mlp.0.weight', 'blocks.0.mlp.2.weight', 'prj.weight', 'blocks.0.atn.val.weight', 'blocks.0.atn.qry.weight', 'blocks.1.atn.key.weight', 'blocks.0.mlp.0.weight', 'blocks.1.atn.qry.weight', 'blocks.1.atn.val.weight', 'blocks.1.mlp.2.weight', 'tok_emb.weight', 'blocks.0.atn.key.weight', 'ln.weight']

rest
['blocks.0.mlp.0.bias', 'blocks.1.atn.val.bias', 'blocks.1.atn.qry.bias', 'blocks.1.atn.key.bias', 'prj.bias', 'blocks.1.mlp.0.bias', 'blocks.0.mlp.2.bias', 'ln.bias', 'blocks.0.atn.qry.bias', 'blocks.0.atn.key.bias', 'blocks.1.mlp.2.bias', 'blocks.0.atn.val.bias', 'pos_emb']

other
[]


In [4]:
# Train on the dataset built above. While dataset.selfsupervised is truthy,
# train() writes a checkpoint and prints a generated sample every
# trainer.sample_rate iterations.
trainer = Trainer(model, optimiser, dataset, hyp)
trainer.train()

# Everything below is disabled experiment code, kept for reference.

# Restore the saved checkpoint into a fresh model instead of training:
# trainer.load(M3(dataset.vocab_size, hyp))

# # then do supervised fine tuning and only train the extended part of the model
# trainer.dataset.selfsupervised = False

# '''
# could feed in x with no mask
# then pull out next value 10 times, compare to y
# '''

# extended_model = Extention(model, dataset.vocab_size, hyp)

# hyp.max_epochs = 1
# NOTE(review): the call below omits the `optimiser` argument that
# Trainer.__init__ requires, so it would fail if re-enabled as written.
# trainer = Trainer(extended_model, dataset, hyp)# turn off grad for end params!
# trainer.train()

# Print input, target and greedy (top-1) prediction for each example:
# for (x, y) in trainer.dataset:
#     print(''.join([trainer.dataset.i2s(i) for i in x]), end='   ')
#     print(''.join([trainer.dataset.i2s(i) for i in y]))
#     yhat = trainer.model(x.unsqueeze(0))[0]
#     _, w_ix = torch.topk(F.softmax(yhat, dim=-1), k=1, dim=-1)
#     print(''.join([trainer.dataset.i2s(i) for i in w_ix]), end='\n\n')


epoch 1 iter 6: train loss 1.86064. lr 3.000000e-03:   0%|          | 5/37726 [00:00<1:26:00,  7.31it/s]

multiply 7 by 10lsu 0pev7e36-t6mco7n5u4p4.p5:mpm49e14_


epoch 1 iter 108: train loss 1.39333. lr 3.000000e-03:   0%|          | 109/37726 [00:02<16:19, 38.40it/s]

multiply 324 by -443 218 and:309_


epoch 1 iter 207: train loss 1.09336. lr 3.000000e-03:   1%|          | 204/37726 [00:05<17:10, 36.40it/s]

add -1 and 4 and -3:16239_


epoch 1 iter 308: train loss 1.05549. lr 3.000000e-03:   1%|          | 309/37726 [00:07<15:45, 39.57it/s]

multiply 490 by -20:10950_


epoch 1 iter 408: train loss 1.00166. lr 3.000000e-03:   1%|          | 409/37726 [00:09<15:34, 39.95it/s]

subtract 3 from -432:43_


epoch 1 iter 508: train loss 0.98778. lr 3.000000e-03:   1%|▏         | 509/37726 [00:12<15:59, 38.79it/s]

divide -3 by -383:14.01004_


epoch 1 iter 608: train loss 0.98226. lr 3.000000e-03:   2%|▏         | 609/37726 [00:14<15:33, 39.76it/s]

multiply -4 by -305:2393_


epoch 1 iter 708: train loss 0.98616. lr 3.000000e-03:   2%|▏         | 709/37726 [00:17<15:26, 39.97it/s]

divide 419 by -243:-1.4479_


epoch 1 iter 808: train loss 0.95538. lr 3.000000e-03:   2%|▏         | 809/37726 [00:19<15:17, 40.23it/s]

add 2 and -397:-944_


epoch 1 iter 908: train loss 0.94581. lr 3.000000e-03:   2%|▏         | 909/37726 [00:21<15:29, 39.61it/s]

multiply 305 by -60:-900_


epoch 1 iter 1008: train loss 0.97590. lr 3.000000e-03:   3%|▎         | 1009/37726 [00:24<15:29, 39.49it/s]

divide 381 by 197:4.6689_


epoch 1 iter 1108: train loss 0.93947. lr 3.000000e-03:   3%|▎         | 1109/37726 [00:26<15:55, 38.34it/s]

add -303 and 472:-736_


epoch 1 iter 1208: train loss 0.95141. lr 3.000000e-03:   3%|▎         | 1209/37726 [00:29<15:23, 39.53it/s]

subtract 94 from 135:295_


epoch 1 iter 1307: train loss 0.92092. lr 3.000000e-03:   3%|▎         | 1304/37726 [00:31<15:39, 38.77it/s]

multiply -19 by -24:3746_


epoch 1 iter 1407: train loss 0.90620. lr 3.000000e-03:   4%|▎         | 1404/37726 [00:33<15:44, 38.45it/s]

multiply 5 by -346:-148_


epoch 1 iter 1508: train loss 0.90011. lr 3.000000e-03:   4%|▍         | 1509/37726 [00:36<15:26, 39.11it/s]

multiply -459 by 313:-6922_


epoch 1 iter 1607: train loss 0.90477. lr 3.000000e-03:   4%|▍         | 1604/37726 [00:38<15:45, 38.22it/s]

subtract 18 from -5:-32_


epoch 1 iter 1708: train loss 0.89979. lr 3.000000e-03:   5%|▍         | 1709/37726 [00:41<15:14, 39.39it/s]

multiply -1 by -421:4794_


epoch 1 iter 1808: train loss 0.89712. lr 3.000000e-03:   5%|▍         | 1808/37726 [00:43<15:03, 39.73it/s]

subtract -490 from -471:830_


epoch 1 iter 1907: train loss 0.89864. lr 3.000000e-03:   5%|▌         | 1908/37726 [00:45<15:27, 38.60it/s]

divide 65 by -246:-0.30853_


epoch 1 iter 2007: train loss 0.87841. lr 3.000000e-03:   5%|▌         | 2007/37726 [00:48<15:22, 38.74it/s]

subtract -3 from -230:-221_


epoch 1 iter 2108: train loss 0.90644. lr 3.000000e-03:   6%|▌         | 2107/37726 [00:50<14:40, 40.46it/s]

add 4 and 153:134_


epoch 1 iter 2208: train loss 0.88855. lr 3.000000e-03:   6%|▌         | 2207/37726 [00:52<14:42, 40.25it/s]

add -3 and -382:-272_


epoch 1 iter 2307: train loss 0.85329. lr 3.000000e-03:   6%|▌         | 2307/37726 [00:55<15:06, 39.09it/s]

divide -2 by -492:0.0004_


epoch 1 iter 2407: train loss 0.86493. lr 3.000000e-03:   6%|▋         | 2407/37726 [00:57<15:01, 39.20it/s]

subtract 170 from -273:-320_


epoch 1 iter 2507: train loss 0.88365. lr 3.000000e-03:   7%|▋         | 2507/37726 [01:00<14:57, 39.22it/s]

multiply -241 by -21:8718_


epoch 1 iter 2607: train loss 0.84561. lr 3.000000e-03:   7%|▋         | 2607/37726 [01:02<15:05, 38.80it/s]

divide 34 by -1:-34.0000_


epoch 1 iter 2708: train loss 0.82974. lr 3.000000e-03:   7%|▋         | 2707/37726 [01:04<14:46, 39.48it/s]

divide 1 by -397:-0.0029_


epoch 1 iter 2807: train loss 0.86699. lr 3.000000e-03:   7%|▋         | 2807/37726 [01:07<14:32, 40.02it/s]

multiply 253 by 3:6182_


epoch 1 iter 2908: train loss 0.84750. lr 3.000000e-03:   8%|▊         | 2907/37726 [01:09<14:51, 39.07it/s]

divide -7 by 9:-5.0000_


epoch 1 iter 3008: train loss 0.84940. lr 3.000000e-03:   8%|▊         | 3007/37726 [01:12<14:34, 39.70it/s]

multiply -42 by 37:-1157_


epoch 1 iter 3107: train loss 0.84920. lr 3.000000e-03:   8%|▊         | 3107/37726 [01:14<15:09, 38.06it/s]

multiply 3 by 160:540_


epoch 1 iter 3208: train loss 0.84702. lr 3.000000e-03:   9%|▊         | 3207/37726 [01:16<14:41, 39.17it/s]

divide 230 by -38:-6.6925_


epoch 1 iter 3307: train loss 0.85556. lr 3.000000e-03:   9%|▉         | 3307/37726 [01:19<14:32, 39.47it/s]

subtract -92 from 6:95_


epoch 1 iter 3408: train loss 0.84800. lr 3.000000e-03:   9%|▉         | 3407/37726 [01:21<14:25, 39.64it/s]

multiply -5 by 15:-60_


epoch 1 iter 3508: train loss 0.84759. lr 3.000000e-03:   9%|▉         | 3507/37726 [01:23<14:15, 39.99it/s]

subtract 1 from -118:-169_


epoch 1 iter 3608: train loss 0.82557. lr 3.000000e-03:  10%|▉         | 3607/37726 [01:26<14:24, 39.46it/s]

divide -108 by -44:4.1641_


epoch 1 iter 3707: train loss 0.82117. lr 3.000000e-03:  10%|▉         | 3707/37726 [01:28<14:40, 38.64it/s]

divide 5 by -273:-0.0126_


epoch 1 iter 3808: train loss 0.84597. lr 3.000000e-03:  10%|█         | 3806/37726 [01:31<14:42, 38.44it/s]

divide -155 by 486:-0.2503_


epoch 1 iter 3907: train loss 0.85204. lr 3.000000e-03:  10%|█         | 3906/37726 [01:33<14:03, 40.10it/s]

add -467 and -344:-85_


epoch 1 iter 4007: train loss 0.84945. lr 3.000000e-03:  11%|█         | 4006/37726 [01:35<14:42, 38.21it/s]

multiply 482 by 451:176338_


epoch 1 iter 4108: train loss 0.84133. lr 3.000000e-03:  11%|█         | 4105/37726 [01:38<14:45, 37.95it/s]

subtract 456 from -23:-461_


epoch 1 iter 4208: train loss 0.85368. lr 3.000000e-03:  11%|█         | 4205/37726 [01:40<14:33, 38.36it/s]

add -15 and -106:-110_


epoch 1 iter 4307: train loss 0.82312. lr 3.000000e-03:  11%|█▏        | 4305/37726 [01:43<14:47, 37.66it/s]

divide 1 by -250:-0.0405_


epoch 1 iter 4407: train loss 0.84321. lr 3.000000e-03:  12%|█▏        | 4408/37726 [01:45<14:20, 38.74it/s]

add 354 and 256:421_


epoch 1 iter 4507: train loss 0.82236. lr 3.000000e-03:  12%|█▏        | 4508/37726 [01:47<14:14, 38.87it/s]

subtract 491 from 12:-366_


epoch 1 iter 4607: train loss 0.84408. lr 3.000000e-03:  12%|█▏        | 4607/37726 [01:50<14:32, 37.94it/s]

multiply -232 by -246:685426_


epoch 1 iter 4707: train loss 0.82254. lr 3.000000e-03:  12%|█▏        | 4707/37726 [01:52<14:01, 39.22it/s]

add 348 and -12:335_


epoch 1 iter 4807: train loss 0.86701. lr 3.000000e-03:  13%|█▎        | 4806/37726 [01:55<14:16, 38.42it/s]

subtract -399 from -12:327_


epoch 1 iter 4907: train loss 0.83206. lr 3.000000e-03:  13%|█▎        | 4905/37726 [01:57<15:14, 35.90it/s]

add -349 and 5:-ad 346:-831_


epoch 1 iter 5007: train loss 0.80825. lr 3.000000e-03:  13%|█▎        | 5007/37726 [02:00<14:05, 38.71it/s]

add -24 and 225:235_


epoch 1 iter 5108: train loss 0.83293. lr 3.000000e-03:  14%|█▎        | 5107/37726 [02:02<14:19, 37.96it/s]

divide -4 by 157:-0.0364_


epoch 1 iter 5207: train loss 0.83968. lr 3.000000e-03:  14%|█▍        | 5207/37726 [02:04<13:57, 38.83it/s]

subtract 35 from 33:-78_


epoch 1 iter 5307: train loss 0.80656. lr 3.000000e-03:  14%|█▍        | 5307/37726 [02:07<13:52, 38.95it/s]

subtract -2 from -265:-260_


epoch 1 iter 5408: train loss 0.81080. lr 3.000000e-03:  14%|█▍        | 5406/37726 [02:09<13:48, 39.02it/s]

divide -9 by -375:0.0328_


epoch 1 iter 5508: train loss 0.80416. lr 3.000000e-03:  15%|█▍        | 5506/37726 [02:12<13:49, 38.85it/s]

divide -16 by 125:-0.1123_


epoch 1 iter 5607: train loss 0.79208. lr 3.000000e-03:  15%|█▍        | 5605/37726 [02:14<13:53, 38.55it/s]

subtract -136 from 458:312_


epoch 1 iter 5707: train loss 0.82078. lr 3.000000e-03:  15%|█▌        | 5704/37726 [02:16<14:25, 36.99it/s]

divide -94 by 461:-0.2159_


epoch 1 iter 5808: train loss 0.82043. lr 3.000000e-03:  15%|█▌        | 5809/37726 [02:19<13:15, 40.10it/s]

add 12 and -2:9_


epoch 1 iter 5907: train loss 0.80992. lr 3.000000e-03:  16%|█▌        | 5904/37726 [02:21<13:45, 38.56it/s]

add 292 and -35:195_


epoch 1 iter 6008: train loss 0.82607. lr 3.000000e-03:  16%|█▌        | 6009/37726 [02:24<13:39, 38.71it/s]

multiply -48 by -352:170324_


epoch 1 iter 6108: train loss 0.78905. lr 3.000000e-03:  16%|█▌        | 6109/37726 [02:26<13:11, 39.95it/s]

subtract -338 from -195:349_


epoch 1 iter 6207: train loss 0.79693. lr 3.000000e-03:  16%|█▋        | 6204/37726 [02:28<13:48, 38.06it/s]

add -40 and 494:482_


epoch 1 iter 6307: train loss 0.79726. lr 3.000000e-03:  17%|█▋        | 6308/37726 [02:31<13:46, 38.00it/s]

multiply -102 by 29:-58968_


epoch 1 iter 6408: train loss 0.78752. lr 3.000000e-03:  17%|█▋        | 6408/37726 [02:33<13:09, 39.68it/s]

multiply 39 by -396:-15824_


epoch 1 iter 6507: train loss 0.81623. lr 3.000000e-03:  17%|█▋        | 6508/37726 [02:36<13:13, 39.33it/s]

subtract 5 from 295:200_


epoch 1 iter 6607: train loss 0.81631. lr 3.000000e-03:  18%|█▊        | 6608/37726 [02:38<13:37, 38.07it/s]

add 1 and 249:240_


epoch 1 iter 6707: train loss 0.79646. lr 3.000000e-03:  18%|█▊        | 6708/37726 [02:41<13:10, 39.23it/s]

multiply 20 by 88:1920_


epoch 1 iter 6807: train loss 0.80270. lr 3.000000e-03:  18%|█▊        | 6808/37726 [02:43<13:09, 39.17it/s]

subtract -29 from 104:133_


epoch 1 iter 6907: train loss 0.80226. lr 3.000000e-03:  18%|█▊        | 6908/37726 [02:45<13:36, 37.76it/s]

divide -3 by 472:-0.0069_


epoch 1 iter 7007: train loss 0.79714. lr 3.000000e-03:  19%|█▊        | 7005/37726 [02:48<13:23, 38.25it/s]

multiply 1 by 462:462_


epoch 1 iter 7107: train loss 0.77902. lr 3.000000e-03:  19%|█▉        | 7105/37726 [02:50<13:55, 36.66it/s]

add 31 and 207:368_


epoch 1 iter 7207: train loss 0.80374. lr 3.000000e-03:  19%|█▉        | 7208/37726 [02:53<13:21, 38.07it/s]

multiply -36 by -308:11732_


epoch 1 iter 7307: train loss 0.78256. lr 3.000000e-03:  19%|█▉        | 7308/37726 [02:55<13:19, 38.05it/s]

divide -6 by -18:0.3888_


epoch 1 iter 7407: train loss 0.81245. lr 3.000000e-03:  20%|█▉        | 7407/37726 [02:58<12:54, 39.14it/s]

add 489 and 97:58_


epoch 1 iter 7508: train loss 0.80603. lr 3.000000e-03:  20%|█▉        | 7507/37726 [03:00<12:53, 39.08it/s]

divide -47 by 128:-0.3669_


epoch 1 iter 7608: train loss 0.79357. lr 3.000000e-03:  20%|██        | 7607/37726 [03:03<12:35, 39.89it/s]

add 447 and 386:493_


epoch 1 iter 7707: train loss 0.78320. lr 3.000000e-03:  20%|██        | 7707/37726 [03:05<12:59, 38.53it/s]

add -119 and -278:-339_


epoch 1 iter 7807: train loss 0.79826. lr 3.000000e-03:  21%|██        | 7806/37726 [03:07<12:41, 39.27it/s]

subtract 187 from -24:-251_


epoch 1 iter 7907: train loss 0.79098. lr 3.000000e-03:  21%|██        | 7905/37726 [03:10<13:15, 37.47it/s]

add -303 and 4:-203_


epoch 1 iter 8007: train loss 0.79189. lr 3.000000e-03:  21%|██        | 8007/37726 [03:12<12:48, 38.69it/s]

multiply -2 by -128:286_


epoch 1 iter 8107: train loss 0.78199. lr 3.000000e-03:  21%|██▏       | 8106/37726 [03:15<12:43, 38.80it/s]

subtract -10 from 489:469_


epoch 1 iter 8207: train loss 0.80972. lr 3.000000e-03:  22%|██▏       | 8206/37726 [03:17<12:50, 38.31it/s]

multiply -256 by 321:-92125_


epoch 1 iter 8307: train loss 0.78128. lr 3.000000e-03:  22%|██▏       | 8305/37726 [03:20<13:13, 37.08it/s]

multiply -50 by 326:-1750_


epoch 1 iter 8407: train loss 0.78936. lr 3.000000e-03:  22%|██▏       | 8408/37726 [03:22<12:41, 38.51it/s]

divide 477 by 277:1.8809_


epoch 1 iter 8507: train loss 0.78275. lr 3.000000e-03:  23%|██▎       | 8507/37726 [03:24<12:42, 38.32it/s]

subtract -205 from -330:695_


epoch 1 iter 8608: train loss 0.78427. lr 3.000000e-03:  23%|██▎       | 8607/37726 [03:27<12:20, 39.34it/s]

multiply -242 by -42:9640_


epoch 1 iter 8708: train loss 0.75209. lr 3.000000e-03:  23%|██▎       | 8707/37726 [03:29<12:21, 39.14it/s]

add 11 and 285:294_


epoch 1 iter 8808: train loss 0.79178. lr 3.000000e-03:  23%|██▎       | 8807/37726 [03:32<12:19, 39.12it/s]

multiply 176 by -458:-8678_


epoch 1 iter 8907: train loss 0.79085. lr 3.000000e-03:  24%|██▎       | 8907/37726 [03:34<12:11, 39.42it/s]

subtract -123 from 37:156_


epoch 1 iter 9007: train loss 0.78989. lr 3.000000e-03:  24%|██▍       | 9007/37726 [03:37<12:17, 38.94it/s]

multiply -31 by -333:10133_


epoch 1 iter 9107: train loss 0.80217. lr 3.000000e-03:  24%|██▍       | 9106/37726 [03:39<12:43, 37.49it/s]

divide -274 by 49:-4.7656_


epoch 1 iter 9208: train loss 0.77107. lr 3.000000e-03:  24%|██▍       | 9204/37726 [03:42<12:41, 37.48it/s]

multiply -2 by 417:-814_


epoch 1 iter 9308: train loss 0.79415. lr 3.000000e-03:  25%|██▍       | 9304/37726 [03:44<12:38, 37.49it/s]

subtract 372 from 414:832_


epoch 1 iter 9407: train loss 0.79032. lr 3.000000e-03:  25%|██▍       | 9408/37726 [03:46<12:55, 36.50it/s]

divide 153 by -450:-3.1125_


epoch 1 iter 9507: train loss 0.78196. lr 3.000000e-03:  25%|██▌       | 9507/37726 [03:49<12:16, 38.32it/s]

divide 86 by 20:3.2000_


epoch 1 iter 9607: train loss 0.77434. lr 3.000000e-03:  25%|██▌       | 9606/37726 [03:51<12:25, 37.72it/s]

multiply 29 by -55:-1295_


epoch 1 iter 9708: train loss 0.78188. lr 3.000000e-03:  26%|██▌       | 9706/37726 [03:54<11:57, 39.06it/s]

divide -15 by 42:-0.4322_


epoch 1 iter 9807: train loss 0.74723. lr 3.000000e-03:  26%|██▌       | 9806/37726 [03:56<12:18, 37.79it/s]

subtract 187 from 2:-175_


epoch 1 iter 9907: train loss 0.78458. lr 3.000000e-03:  26%|██▋       | 9905/37726 [03:59<12:42, 36.49it/s]

divide -12 by 50:-0.2800_


epoch 1 iter 10007: train loss 0.77653. lr 3.000000e-03:  27%|██▋       | 10005/37726 [04:01<12:25, 37.17it/s]

subtract 2 from -185:-187_


epoch 1 iter 10107: train loss 0.76144. lr 3.000000e-03:  27%|██▋       | 10105/37726 [04:04<12:19, 37.34it/s]

add -13 and 246:233_


epoch 1 iter 10207: train loss 0.78070. lr 3.000000e-03:  27%|██▋       | 10208/37726 [04:06<12:09, 37.73it/s]

add 33 and -251:-298_


epoch 1 iter 10307: train loss 0.78227. lr 3.000000e-03:  27%|██▋       | 10305/37726 [04:09<12:02, 37.93it/s]

multiply 1 by 36:36_


epoch 1 iter 10407: train loss 0.78607. lr 3.000000e-03:  28%|██▊       | 10407/37726 [04:11<11:43, 38.81it/s]

add -69 and 92:-202_


epoch 1 iter 10507: train loss 0.77743. lr 3.000000e-03:  28%|██▊       | 10506/37726 [04:14<11:49, 38.39it/s]

divide 1 by 14:0.0785_


epoch 1 iter 10608: train loss 0.78232. lr 3.000000e-03:  28%|██▊       | 10606/37726 [04:16<11:33, 39.09it/s]

subtract -359 from -23:342_


epoch 1 iter 10707: train loss 0.77197. lr 3.000000e-03:  28%|██▊       | 10705/37726 [04:18<12:08, 37.07it/s]

add -218 and 14:-172_


epoch 1 iter 10807: train loss 0.77049. lr 3.000000e-03:  29%|██▊       | 10804/37726 [04:21<12:02, 37.26it/s]

divide -285 by -5:52.0000_


epoch 1 iter 10908: train loss 0.76094. lr 3.000000e-03:  29%|██▉       | 10909/37726 [04:23<11:35, 38.54it/s]

multiply 189 by -445:-87555_


epoch 1 iter 11007: train loss 0.74156. lr 3.000000e-03:  29%|██▉       | 11004/37726 [04:26<12:07, 36.72it/s]

divide -48 by 14:-4.5000_


epoch 1 iter 11107: train loss 0.76751. lr 3.000000e-03:  29%|██▉       | 11107/37726 [04:28<12:01, 36.89it/s]

multiply -41 by 228:-9208_


epoch 1 iter 11207: train loss 0.76155. lr 3.000000e-03:  30%|██▉       | 11207/37726 [04:31<11:28, 38.50it/s]

multiply 425 by 109:60075_


epoch 1 iter 11307: train loss 0.80347. lr 3.000000e-03:  30%|██▉       | 11305/37726 [04:33<11:38, 37.82it/s]

subtract 2 from -141:-149_


epoch 1 iter 11407: train loss 0.79952. lr 3.000000e-03:  30%|███       | 11407/37726 [04:36<11:31, 38.04it/s]

divide 43 by 336:0.1202_


epoch 1 iter 11508: train loss 0.76618. lr 3.000000e-03:  31%|███       | 11507/37726 [04:38<11:14, 38.87it/s]

add 353 and -362:-765_


epoch 1 iter 11608: train loss 0.76929. lr 3.000000e-03:  31%|███       | 11606/37726 [04:41<11:18, 38.48it/s]

divide 39 by 174:0.2211_


epoch 1 iter 11707: train loss 0.73668. lr 3.000000e-03:  31%|███       | 11706/37726 [04:43<11:20, 38.21it/s]

divide -493 by 116:-3.3333_


epoch 1 iter 11807: train loss 0.77107. lr 3.000000e-03:  31%|███▏      | 11806/37726 [04:46<11:16, 38.34it/s]

multiply -323 by 264:-72372_


epoch 1 iter 11907: train loss 0.74888. lr 3.000000e-03:  32%|███▏      | 11906/37726 [04:48<11:12, 38.40it/s]

subtract -1 from 8:1_


epoch 1 iter 12007: train loss 0.79422. lr 3.000000e-03:  32%|███▏      | 12008/37726 [04:51<11:06, 38.60it/s]

subtract -453 from -172:514_


epoch 1 iter 12107: train loss 0.75218. lr 3.000000e-03:  32%|███▏      | 12108/37726 [04:53<11:01, 38.74it/s]

divide -32 by -40:0.7725_


epoch 1 iter 12207: train loss 0.77745. lr 3.000000e-03:  32%|███▏      | 12206/37726 [04:55<11:09, 38.12it/s]

subtract 29 from -295:-38_


epoch 1 iter 12307: train loss 0.74719. lr 3.000000e-03:  33%|███▎      | 12306/37726 [04:58<11:07, 38.07it/s]

add -1 and 256:255_


epoch 1 iter 12407: train loss 0.77793. lr 3.000000e-03:  33%|███▎      | 12406/37726 [05:00<10:58, 38.44it/s]

add 41 and -153:-108_


epoch 1 iter 12507: train loss 0.73844. lr 3.000000e-03:  33%|███▎      | 12506/37726 [05:03<10:51, 38.70it/s]

subtract 272 from 5:-277_


epoch 1 iter 12607: train loss 0.76780. lr 3.000000e-03:  33%|███▎      | 12606/37726 [05:05<10:43, 39.02it/s]

subtract 205 from 318:153_


epoch 1 iter 12707: train loss 0.73621. lr 3.000000e-03:  34%|███▎      | 12705/37726 [05:08<11:01, 37.80it/s]

add -387 and -50:-427_


epoch 1 iter 12807: train loss 0.77463. lr 3.000000e-03:  34%|███▍      | 12804/37726 [05:10<11:13, 37.02it/s]

multiply 4 by -251:-964_


epoch 1 iter 12908: train loss 0.75776. lr 3.000000e-03:  34%|███▍      | 12907/37726 [05:13<10:40, 38.76it/s]

divide 389 by -287:-1.1052_


epoch 1 iter 13007: train loss 0.76676. lr 3.000000e-03:  34%|███▍      | 13007/37726 [05:15<11:03, 37.24it/s]

multiply -386 by -449:169344_


epoch 1 iter 13107: train loss 0.76566. lr 3.000000e-03:  35%|███▍      | 13106/37726 [05:18<10:27, 39.22it/s]

add -12 and -382:-300_


epoch 1 iter 13207: train loss 0.77781. lr 3.000000e-03:  35%|███▌      | 13206/37726 [05:20<10:59, 37.19it/s]

multiply -44 by -179:948_


epoch 1 iter 13307: train loss 0.77125. lr 3.000000e-03:  35%|███▌      | 13305/37726 [05:23<10:42, 38.01it/s]

subtract 48 from 427:395_


epoch 1 iter 13407: train loss 0.77387. lr 3.000000e-03:  36%|███▌      | 13408/37726 [05:25<10:49, 37.45it/s]

divide -153 by -45:4.6000_


epoch 1 iter 13507: train loss 0.77559. lr 3.000000e-03:  36%|███▌      | 13506/37726 [05:28<10:43, 37.63it/s]

divide 275 by -15:-16.0000_


epoch 1 iter 13607: train loss 0.76159. lr 3.000000e-03:  36%|███▌      | 13605/37726 [05:30<10:42, 37.56it/s]

add -329 and -47:-312_


epoch 1 iter 13707: train loss 0.73969. lr 3.000000e-03:  36%|███▋      | 13705/37726 [05:32<10:47, 37.09it/s]

add 50 and -190:-140_


epoch 1 iter 13807: train loss 0.75338. lr 3.000000e-03:  37%|███▋      | 13805/37726 [05:35<10:45, 37.05it/s]

subtract 2 from 280:288_


epoch 1 iter 13907: train loss 0.79096. lr 3.000000e-03:  37%|███▋      | 13908/37726 [05:37<10:29, 37.85it/s]

subtract -41 from -5:38_


epoch 1 iter 14007: train loss 0.77051. lr 3.000000e-03:  37%|███▋      | 14006/37726 [05:40<10:25, 37.93it/s]

divide 472 by 46:9.9212_


epoch 1 iter 14107: train loss 0.74371. lr 3.000000e-03:  37%|███▋      | 14105/37726 [05:42<10:56, 35.98it/s]

add 1 and -349:-348_


epoch 1 iter 14207: train loss 0.76584. lr 3.000000e-03:  38%|███▊      | 14208/37726 [05:45<10:21, 37.81it/s]

add -18 and -137:-145_


epoch 1 iter 14307: train loss 0.74860. lr 3.000000e-03:  38%|███▊      | 14307/37726 [05:47<10:23, 37.58it/s]

add 11 and -102:-103_


epoch 1 iter 14407: train loss 0.76058. lr 3.000000e-03:  38%|███▊      | 14405/37726 [05:50<10:32, 36.85it/s]

subtract -34 from 176:220_


epoch 1 iter 14507: train loss 0.77847. lr 3.000000e-03:  38%|███▊      | 14508/37726 [05:52<09:59, 38.71it/s]

add -186 and 39:-137_


epoch 1 iter 14607: train loss 0.77031. lr 3.000000e-03:  39%|███▊      | 14607/37726 [05:55<10:17, 37.42it/s]

divide 5 by -34:-0.1442_


epoch 1 iter 14707: train loss 0.76598. lr 3.000000e-03:  39%|███▉      | 14704/37726 [05:57<10:41, 35.91it/s]

subtract 440 from 34:-382_


epoch 1 iter 14808: train loss 0.74098. lr 3.000000e-03:  39%|███▉      | 14805/37726 [06:00<10:15, 37.25it/s]

add -58 and 248:200_


epoch 1 iter 14907: train loss 0.75913. lr 3.000000e-03:  40%|███▉      | 14905/37726 [06:02<10:15, 37.06it/s]

subtract 145 from 4:-149_


epoch 1 iter 15007: train loss 0.76695. lr 3.000000e-03:  40%|███▉      | 15007/37726 [06:05<10:08, 37.36it/s]

subtract -1 from 97:98_


epoch 1 iter 15107: train loss 0.75587. lr 3.000000e-03:  40%|████      | 15108/37726 [06:07<10:08, 37.17it/s]

divide -183 by 51:-3.68594_


epoch 1 iter 15207: train loss 0.73979. lr 3.000000e-03:  40%|████      | 15208/37726 [06:10<09:56, 37.77it/s]

add -5 and -262:-267_


epoch 1 iter 15307: train loss 0.77283. lr 3.000000e-03:  41%|████      | 15308/37726 [06:12<10:07, 36.88it/s]

divide 3 by 403:0.0076_


epoch 1 iter 15407: train loss 0.72920. lr 3.000000e-03:  41%|████      | 15406/37726 [06:15<09:43, 38.27it/s]

multiply 35 by 89:2325_


epoch 1 iter 15507: train loss 0.77406. lr 3.000000e-03:  41%|████      | 15505/37726 [06:17<09:57, 37.22it/s]

subtract -29 from -33:-9_


epoch 1 iter 15607: train loss 0.76789. lr 3.000000e-03:  41%|████▏     | 15605/37726 [06:20<10:04, 36.60it/s]

add 196 and -2:194_


epoch 1 iter 15707: train loss 0.74637. lr 3.000000e-03:  42%|████▏     | 15708/37726 [06:22<09:33, 38.37it/s]

subtract -182 from -28:144_


epoch 1 iter 15807: train loss 0.78382. lr 3.000000e-03:  42%|████▏     | 15807/37726 [06:25<09:35, 38.06it/s]

add -221 and 106:-337_


epoch 1 iter 15907: train loss 0.77402. lr 3.000000e-03:  42%|████▏     | 15905/37726 [06:27<10:05, 36.05it/s]

divide 23 by 117:0.1828_


epoch 1 iter 16008: train loss 0.76353. lr 3.000000e-03:  42%|████▏     | 16008/37726 [06:30<09:37, 37.59it/s]

add 39 and -171:-150_


epoch 1 iter 16107: train loss 0.76068. lr 3.000000e-03:  43%|████▎     | 16108/37726 [06:32<09:30, 37.87it/s]

add 53 and 461:518_


epoch 1 iter 16207: train loss 0.72501. lr 3.000000e-03:  43%|████▎     | 16206/37726 [06:35<09:40, 37.08it/s]

add -256 and 6:-250_


epoch 1 iter 16307: train loss 0.74703. lr 3.000000e-03:  43%|████▎     | 16307/37726 [06:37<09:16, 38.49it/s]

subtract -411 from -24:387_


epoch 1 iter 16406: train loss 0.76055. lr 3.000000e-03:  43%|████▎     | 16406/37726 [06:40<09:20, 38.01it/s]

add -458 and 131:-367_


epoch 1 iter 16507: train loss 0.75253. lr 3.000000e-03:  44%|████▍     | 16508/37726 [06:42<09:09, 38.62it/s]

subtract -22 from -186:-194_


epoch 1 iter 16607: train loss 0.74858. lr 3.000000e-03:  44%|████▍     | 16608/37726 [06:45<09:05, 38.73it/s]

add 349 and -205:130_


epoch 1 iter 16707: train loss 0.73059. lr 3.000000e-03:  44%|████▍     | 16708/37726 [06:47<09:36, 36.45it/s]

subtract 469 from -187:-648_


epoch 1 iter 16807: train loss 0.74589. lr 3.000000e-03:  45%|████▍     | 16805/37726 [06:50<09:29, 36.76it/s]

multiply -27 by 470:-12600_


epoch 1 iter 16907: train loss 0.70968. lr 3.000000e-03:  45%|████▍     | 16908/37726 [06:52<09:06, 38.11it/s]

divide 5 by 480:0.0133_


epoch 1 iter 17007: train loss 0.73206. lr 3.000000e-03:  45%|████▌     | 17006/37726 [06:55<09:24, 36.73it/s]

add -17 and -140:-167_


epoch 1 iter 17107: train loss 0.72539. lr 3.000000e-03:  45%|████▌     | 17104/37726 [06:57<09:20, 36.78it/s]

add -289 and -81:-310_


epoch 1 iter 17207: train loss 0.75336. lr 3.000000e-03:  46%|████▌     | 17207/37726 [07:00<09:07, 37.46it/s]

add 350 and 21:462_


epoch 1 iter 17307: train loss 0.74991. lr 3.000000e-03:  46%|████▌     | 17308/37726 [07:02<08:58, 37.90it/s]

subtract -19 from 250:279_


epoch 1 iter 17407: train loss 0.73440. lr 3.000000e-03:  46%|████▌     | 17407/37726 [07:05<09:06, 37.18it/s]

multiply 47 by -303:-14211_


epoch 1 iter 17507: train loss 0.75583. lr 3.000000e-03:  46%|████▋     | 17506/37726 [07:07<08:44, 38.57it/s]

add 82 and 133:115_


epoch 1 iter 17607: train loss 0.75574. lr 3.000000e-03:  47%|████▋     | 17606/37726 [07:10<08:53, 37.74it/s]

subtract -36 from -12:13_


epoch 1 iter 17707: train loss 0.74268. lr 3.000000e-03:  47%|████▋     | 17708/37726 [07:12<08:55, 37.37it/s]

divide -148 by 454:-0.2061_


epoch 1 iter 17807: train loss 0.74011. lr 3.000000e-03:  47%|████▋     | 17807/37726 [07:15<08:36, 38.54it/s]

multiply 21 by 188:3278_


epoch 1 iter 17907: train loss 0.73576. lr 3.000000e-03:  47%|████▋     | 17906/37726 [07:17<08:29, 38.87it/s]

divide 176 by 499:0.3939_


epoch 1 iter 18007: train loss 0.75464. lr 3.000000e-03:  48%|████▊     | 18005/37726 [07:20<08:57, 36.67it/s]

divide 18 by -162:-0.1058_


epoch 1 iter 18107: train loss 0.75955. lr 3.000000e-03:  48%|████▊     | 18105/37726 [07:22<09:07, 35.81it/s]

subtract -6 from -175:-169_


epoch 1 iter 18207: train loss 0.71444. lr 3.000000e-03:  48%|████▊     | 18208/37726 [07:25<08:37, 37.70it/s]

divide -1 by 180:-0.0051_


epoch 1 iter 18307: train loss 0.76489. lr 3.000000e-03:  49%|████▊     | 18308/37726 [07:27<08:32, 37.91it/s]

divide -39 by 35:-1.0571_


epoch 1 iter 18407: train loss 0.76058. lr 3.000000e-03:  49%|████▉     | 18408/37726 [07:30<08:25, 38.18it/s]

multiply 144 by 1:144_


epoch 1 iter 18507: train loss 0.75586. lr 3.000000e-03:  49%|████▉     | 18506/37726 [07:32<08:40, 36.93it/s]

divide 38 by -8:-3.2500_


epoch 1 iter 18607: train loss 0.76485. lr 3.000000e-03:  49%|████▉     | 18605/37726 [07:35<08:32, 37.29it/s]

multiply 81 by -49:-3681_


epoch 1 iter 18707: train loss 0.74155. lr 3.000000e-03:  50%|████▉     | 18705/37726 [07:37<08:54, 35.60it/s]

divide -29 by 166:-1.5565_


epoch 1 iter 18807: train loss 0.75394. lr 3.000000e-03:  50%|████▉     | 18808/37726 [07:40<08:52, 35.53it/s]

divide -6 by 349:-0.01460_


epoch 1 iter 18907: train loss 0.74920. lr 3.000000e-03:  50%|█████     | 18905/37726 [07:42<08:35, 36.49it/s]

multiply 9 by 330:3570_


epoch 1 iter 19007: train loss 0.73382. lr 3.000000e-03:  50%|█████     | 19004/37726 [07:45<08:40, 35.94it/s]

multiply 3 by -13:-39_


epoch 1 iter 19107: train loss 0.73104. lr 3.000000e-03:  51%|█████     | 19106/37726 [07:47<08:22, 37.09it/s]

add -5 and 154:149_


epoch 1 iter 19208: train loss 0.72792. lr 3.000000e-03:  51%|█████     | 19207/37726 [07:50<08:05, 38.17it/s]

subtract -262 from 47:219_


epoch 1 iter 19307: train loss 0.74151. lr 3.000000e-03:  51%|█████     | 19307/37726 [07:52<08:33, 35.87it/s]

divide 335 by 420:0.7109_


epoch 1 iter 19407: train loss 0.75176. lr 3.000000e-03:  51%|█████▏    | 19406/37726 [07:55<08:03, 37.86it/s]

subtract -1 from 135:136_


epoch 1 iter 19507: train loss 0.75323. lr 3.000000e-03:  52%|█████▏    | 19506/37726 [07:57<08:23, 36.22it/s]

add 475 and -287:186_


epoch 1 iter 19607: train loss 0.72792. lr 3.000000e-03:  52%|█████▏    | 19605/37726 [08:00<07:57, 37.93it/s]

subtract -464 from -18:446_


epoch 1 iter 19708: train loss 0.72738. lr 3.000000e-03:  52%|█████▏    | 19705/37726 [08:02<08:03, 37.30it/s]

add -60 and 23:-27_


epoch 1 iter 19807: train loss 0.72853. lr 3.000000e-03:  52%|█████▏    | 19805/37726 [08:05<08:00, 37.29it/s]

subtract 113 from 44:-81_


epoch 1 iter 19907: train loss 0.73696. lr 3.000000e-03:  53%|█████▎    | 19905/37726 [08:07<08:30, 34.89it/s]

divide -448 by -157:3.0494_


epoch 1 iter 20007: train loss 0.75430. lr 3.000000e-03:  53%|█████▎    | 20008/37726 [08:10<07:47, 37.90it/s]

subtract -279 from -267:-298_


epoch 1 iter 20107: train loss 0.74298. lr 3.000000e-03:  53%|█████▎    | 20105/37726 [08:12<08:33, 34.30it/s]

add 87 and -3:84_


epoch 1 iter 20207: train loss 0.73647. lr 3.000000e-03:  54%|█████▎    | 20207/37726 [08:15<07:45, 37.64it/s]

subtract 5 from 478:473_


epoch 1 iter 20307: train loss 0.73302. lr 3.000000e-03:  54%|█████▍    | 20307/37726 [08:17<07:42, 37.70it/s]

subtract -500 from -5:405_


epoch 1 iter 20407: train loss 0.73739. lr 3.000000e-03:  54%|█████▍    | 20405/37726 [08:20<07:56, 36.38it/s]

add -390 and -16:-406_


epoch 1 iter 20507: train loss 0.75266. lr 3.000000e-03:  54%|█████▍    | 20505/37726 [08:22<07:59, 35.90it/s]

multiply -2 by 144:-288_


epoch 1 iter 20607: train loss 0.75850. lr 3.000000e-03:  55%|█████▍    | 20606/37726 [08:25<07:37, 37.39it/s]

subtract -8 from -5:7_


epoch 1 iter 20707: train loss 0.72294. lr 3.000000e-03:  55%|█████▍    | 20705/37726 [08:27<08:55, 31.81it/s]

add -48 and 19:subtract -3 from -125:-122_


epoch 1 iter 20807: train loss 0.72814. lr 3.000000e-03:  55%|█████▌    | 20806/37726 [08:30<07:24, 38.08it/s]

subtract -23 from 376:309_


epoch 1 iter 20907: train loss 0.74877. lr 3.000000e-03:  55%|█████▌    | 20905/37726 [08:32<07:41, 36.44it/s]

subtract 458 from 25:-493_


epoch 1 iter 21007: train loss 0.72869. lr 3.000000e-03:  56%|█████▌    | 21007/37726 [08:35<07:39, 36.36it/s]

divide -49 by -181:0.380_


epoch 1 iter 21107: train loss 0.74433. lr 3.000000e-03:  56%|█████▌    | 21108/37726 [08:37<07:36, 36.43it/s]

multiply 394 by -359:-128218_


epoch 1 iter 21207: train loss 0.73677. lr 3.000000e-03:  56%|█████▌    | 21205/37726 [08:40<07:21, 37.38it/s]

subtract 126 from 3:-123_


epoch 1 iter 21307: train loss 0.73506. lr 3.000000e-03:  56%|█████▋    | 21304/37726 [08:42<07:16, 37.58it/s]

add -47 and 175:128_


epoch 1 iter 21407: train loss 0.72713. lr 3.000000e-03:  57%|█████▋    | 21404/37726 [08:45<07:19, 37.11it/s]

add 12 and -28:-4_


epoch 1 iter 21507: train loss 0.72423. lr 3.000000e-03:  57%|█████▋    | 21507/37726 [08:47<07:02, 38.36it/s]

multiply 264 by 489:12565_


epoch 1 iter 21607: train loss 0.74058. lr 3.000000e-03:  57%|█████▋    | 21604/37726 [08:50<07:20, 36.56it/s]

subtract 204 from -256:-440_


epoch 1 iter 21707: train loss 0.74121. lr 3.000000e-03:  58%|█████▊    | 21706/37726 [08:52<07:23, 36.09it/s]

add -4 and 168:164_


epoch 1 iter 21807: train loss 0.73306. lr 3.000000e-03:  58%|█████▊    | 21808/37726 [08:55<06:58, 38.00it/s]

add -3 and 308:385_


epoch 1 iter 21907: train loss 0.75850. lr 3.000000e-03:  58%|█████▊    | 21905/37726 [08:57<07:21, 35.86it/s]

multiply 301 by -27:-8137_


epoch 1 iter 22007: train loss 0.72683. lr 3.000000e-03:  58%|█████▊    | 22005/37726 [09:00<07:26, 35.24it/s]

divide -28 by 436:-0.0587_


epoch 1 iter 22107: train loss 0.74692. lr 3.000000e-03:  59%|█████▊    | 22107/37726 [09:03<06:47, 38.34it/s]

add -43 and 74:31_


epoch 1 iter 22206: train loss 0.73697. lr 3.000000e-03:  59%|█████▉    | 22205/37726 [09:05<07:12, 35.88it/s]

multiply -35 by -20:700_


epoch 1 iter 22307: train loss 0.75070. lr 3.000000e-03:  59%|█████▉    | 22308/37726 [09:08<06:55, 37.09it/s]

divide -33 by 123:-0.2435_


epoch 1 iter 22407: train loss 0.73658. lr 3.000000e-03:  59%|█████▉    | 22405/37726 [09:10<07:11, 35.53it/s]

divide -41 by 44:-0.9882_


epoch 1 iter 22507: train loss 0.74222. lr 3.000000e-03:  60%|█████▉    | 22508/37726 [09:13<06:45, 37.54it/s]

add 182 and -38:154_


epoch 1 iter 22607: train loss 0.73562. lr 3.000000e-03:  60%|█████▉    | 22604/37726 [09:15<06:52, 36.65it/s]

multiply -32 by -37:1214_


epoch 1 iter 22707: train loss 0.73903. lr 3.000000e-03:  60%|██████    | 22706/37726 [09:18<07:00, 35.75it/s]

multiply 123 by 304:36372_


epoch 1 iter 22806: train loss 0.74256. lr 3.000000e-03:  60%|██████    | 22805/37726 [09:20<06:48, 36.51it/s]

add -390 and -54:-404_


epoch 1 iter 22907: train loss 0.72200. lr 3.000000e-03:  61%|██████    | 22908/37726 [09:23<06:31, 37.89it/s]

subtract 26 from 154:90_


epoch 1 iter 23007: train loss 0.74571. lr 3.000000e-03:  61%|██████    | 23005/37726 [09:25<06:42, 36.58it/s]

divide 43 by 419:0.1193_


epoch 1 iter 23107: train loss 0.72202. lr 3.000000e-03:  61%|██████    | 23105/37726 [09:28<06:47, 35.90it/s]

divide 43 by 446:0.0972_


epoch 1 iter 23207: train loss 0.73811. lr 3.000000e-03:  62%|██████▏   | 23204/37726 [09:30<06:26, 37.55it/s]

add 95 and 30:93_


epoch 1 iter 23307: train loss 0.74963. lr 3.000000e-03:  62%|██████▏   | 23305/37726 [09:33<06:30, 36.95it/s]

subtract 1 from 410:409_


epoch 1 iter 23407: train loss 0.72341. lr 3.000000e-03:  62%|██████▏   | 23405/37726 [09:35<06:38, 35.94it/s]

divide -376 by -166:2.2797_


epoch 1 iter 23507: train loss 0.72206. lr 3.000000e-03:  62%|██████▏   | 23504/37726 [09:38<06:59, 33.91it/s]

divide 287 by 39:9.5802_


epoch 1 iter 23607: train loss 0.72179. lr 3.000000e-03:  63%|██████▎   | 23606/37726 [09:40<06:16, 37.53it/s]

divide 41 by 316:0.1667_


epoch 1 iter 23707: train loss 0.73639. lr 3.000000e-03:  63%|██████▎   | 23708/37726 [09:43<06:18, 37.03it/s]

divide 19 by -103:-0.1471_


epoch 1 iter 23807: train loss 0.74332. lr 3.000000e-03:  63%|██████▎   | 23805/37726 [09:45<06:26, 35.99it/s]

subtract -4 from 462:466_


epoch 1 iter 23907: train loss 0.71332. lr 3.000000e-03:  63%|██████▎   | 23906/37726 [09:48<06:07, 37.62it/s]

multiply 130 by 2:260_


epoch 1 iter 24006: train loss 0.74911. lr 3.000000e-03:  64%|██████▎   | 24007/37726 [09:50<06:39, 34.35it/s]

multiply 4 by -119:-56_


epoch 1 iter 24107: train loss 0.72000. lr 3.000000e-03:  64%|██████▍   | 24105/37726 [09:53<06:08, 36.98it/s]

subtract -50 from 332:372_


epoch 1 iter 24207: train loss 0.74906. lr 3.000000e-03:  64%|██████▍   | 24205/37726 [09:56<06:15, 35.98it/s]

add 353 and 81:364_


epoch 1 iter 24307: train loss 0.72015. lr 3.000000e-03:  64%|██████▍   | 24308/37726 [09:58<06:12, 36.04it/s]

divide 9 by -69:-0.0153_


epoch 1 iter 24407: train loss 0.73058. lr 3.000000e-03:  65%|██████▍   | 24407/37726 [10:01<06:12, 35.73it/s]

multiply 120 by 1:120_


epoch 1 iter 24507: train loss 0.70200. lr 3.000000e-03:  65%|██████▍   | 24504/37726 [10:03<06:00, 36.65it/s]

subtract -136 from 458:674_


epoch 1 iter 24607: train loss 0.76562. lr 3.000000e-03:  65%|██████▌   | 24607/37726 [10:06<05:43, 38.24it/s]

subtract 26 from 11:-15_


epoch 1 iter 24707: train loss 0.72689. lr 3.000000e-03:  65%|██████▌   | 24706/37726 [10:08<05:42, 37.98it/s]

divide -4 by 13:-0.3047_


epoch 1 iter 24807: train loss 0.74121. lr 3.000000e-03:  66%|██████▌   | 24807/37726 [10:11<05:50, 36.86it/s]

add 3 and 432:435_


epoch 1 iter 24908: train loss 0.74954. lr 3.000000e-03:  66%|██████▌   | 24905/37726 [10:13<05:51, 36.51it/s]

divide -57 by 293:-0.1622_


epoch 1 iter 25007: train loss 0.73070. lr 3.000000e-03:  66%|██████▋   | 25008/37726 [10:16<05:43, 37.01it/s]

add -30 and -24:-54_


epoch 1 iter 25107: train loss 0.69842. lr 3.000000e-03:  67%|██████▋   | 25105/37726 [10:18<05:45, 36.49it/s]

multiply -53 by 232:-12166_


epoch 1 iter 25207: train loss 0.71009. lr 3.000000e-03:  67%|██████▋   | 25205/37726 [10:21<05:41, 36.69it/s]

divide 2 by 304:0.0067_


epoch 1 iter 25307: train loss 0.73470. lr 3.000000e-03:  67%|██████▋   | 25307/37726 [10:23<05:35, 37.07it/s]

divide -59 by -109:0.5247_


epoch 1 iter 25407: train loss 0.74585. lr 3.000000e-03:  67%|██████▋   | 25406/37726 [10:26<05:23, 38.09it/s]

subtract -17 from 24:30_


epoch 1 iter 25507: train loss 0.74070. lr 3.000000e-03:  68%|██████▊   | 25506/37726 [10:28<05:30, 36.97it/s]

multiply -430 by -377:1712910_


epoch 1 iter 25607: train loss 0.73473. lr 3.000000e-03:  68%|██████▊   | 25604/37726 [10:31<05:31, 36.57it/s]

subtract -2 from -436:-434_


epoch 1 iter 25707: train loss 0.70264. lr 3.000000e-03:  68%|██████▊   | 25705/37726 [10:33<05:26, 36.78it/s]

subtract -47 from 438:555_


epoch 1 iter 25807: train loss 0.71902. lr 3.000000e-03:  68%|██████▊   | 25807/37726 [10:36<05:16, 37.67it/s]

add 360 and -187:213_


epoch 1 iter 25907: train loss 0.73052. lr 3.000000e-03:  69%|██████▊   | 25908/37726 [10:38<05:18, 37.13it/s]

divide -15 by 272:-0.0585_


epoch 1 iter 26007: train loss 0.73235. lr 3.000000e-03:  69%|██████▉   | 26008/37726 [10:41<05:11, 37.63it/s]

multiply -27 by 47:-1259_


epoch 1 iter 26107: train loss 0.70425. lr 3.000000e-03:  69%|██████▉   | 26105/37726 [10:43<05:07, 37.74it/s]

multiply 1 by -152:-122_


epoch 1 iter 26207: train loss 0.72484. lr 3.000000e-03:  69%|██████▉   | 26205/37726 [10:46<05:12, 36.81it/s]

add -36 and 420:384_


epoch 1 iter 26307: train loss 0.71435. lr 3.000000e-03:  70%|██████▉   | 26307/37726 [10:48<05:13, 36.39it/s]

divide -10 by -456:0.0228_


epoch 1 iter 26407: train loss 0.70676. lr 3.000000e-03:  70%|██████▉   | 26404/37726 [10:51<05:26, 34.70it/s]

add 39 and -262:-223_


epoch 1 iter 26507: train loss 0.72619. lr 3.000000e-03:  70%|███████   | 26505/37726 [10:54<05:10, 36.11it/s]

add -454 and -282:-698_


epoch 1 iter 26607: train loss 0.71923. lr 3.000000e-03:  71%|███████   | 26606/37726 [10:56<05:15, 35.23it/s]

subtract 20 from 278:258_


epoch 1 iter 26707: train loss 0.72818. lr 3.000000e-03:  71%|███████   | 26705/37726 [10:59<05:01, 36.57it/s]

add 280 and -45:155_


epoch 1 iter 26807: train loss 0.71018. lr 3.000000e-03:  71%|███████   | 26806/37726 [11:01<04:56, 36.87it/s]

multiply -49 by 387:-18003_


epoch 1 iter 26907: train loss 0.75273. lr 3.000000e-03:  71%|███████▏  | 26907/37726 [11:04<05:01, 35.86it/s]

multiply -47 by -35:1555_


epoch 1 iter 27007: train loss 0.74858. lr 3.000000e-03:  72%|███████▏  | 27007/37726 [11:06<04:39, 38.42it/s]

subtract -124 from 284:508_


epoch 1 iter 27107: train loss 0.75329. lr 3.000000e-03:  72%|███████▏  | 27106/37726 [11:09<04:41, 37.68it/s]

add 380 and -422:-148_


epoch 1 iter 27207: train loss 0.76006. lr 3.000000e-03:  72%|███████▏  | 27207/37726 [11:11<04:41, 37.38it/s]

add 5 and 324:329_


epoch 1 iter 27307: train loss 0.72508. lr 3.000000e-03:  72%|███████▏  | 27306/37726 [11:14<04:32, 38.26it/s]

subtract -229 from -296:-77_


epoch 1 iter 27407: train loss 0.73455. lr 3.000000e-03:  73%|███████▎  | 27404/37726 [11:16<04:49, 35.61it/s]

add 12 and -36:-24_


epoch 1 iter 27507: train loss 0.75376. lr 3.000000e-03:  73%|███████▎  | 27504/37726 [11:19<04:43, 36.08it/s]

subtract -5 from -268:-263_


epoch 1 iter 27607: train loss 0.73571. lr 3.000000e-03:  73%|███████▎  | 27604/37726 [11:21<04:32, 37.14it/s]

multiply -16 by -412:6422_


epoch 1 iter 27707: train loss 0.71091. lr 3.000000e-03:  73%|███████▎  | 27704/37726 [11:24<04:29, 37.21it/s]

add -17 and -28:-45_


epoch 1 iter 27807: train loss 0.71227. lr 3.000000e-03:  74%|███████▎  | 27806/37726 [11:26<04:39, 35.43it/s]

multiply 45 by 394:17180_


epoch 1 iter 27907: train loss 0.70419. lr 3.000000e-03:  74%|███████▍  | 27908/37726 [11:29<04:26, 36.80it/s]

divide -86 by -3:28.0000_


epoch 1 iter 28007: train loss 0.74206. lr 3.000000e-03:  74%|███████▍  | 28005/37726 [11:31<04:28, 36.24it/s]

divide 31 by -67:-0.5031_


epoch 1 iter 28106: train loss 0.72686. lr 3.000000e-03:  74%|███████▍  | 28105/37726 [11:34<04:27, 35.94it/s]

subtract 4 from 459:455_


epoch 1 iter 28208: train loss 0.71307. lr 3.000000e-03:  75%|███████▍  | 28204/37726 [11:36<04:37, 34.26it/s]

multiply -5 by -258:1290_


epoch 1 iter 28307: train loss 0.76493. lr 3.000000e-03:  75%|███████▌  | 28307/37726 [11:39<04:16, 36.69it/s]

divide -30 by -229:0.1424_


epoch 1 iter 28407: train loss 0.70324. lr 3.000000e-03:  75%|███████▌  | 28405/37726 [11:41<04:15, 36.44it/s]

multiply 22 by 94:1118_


epoch 1 iter 28507: train loss 0.72662. lr 3.000000e-03:  76%|███████▌  | 28507/37726 [11:44<04:03, 37.94it/s]

multiply -339 by -369:142782_


epoch 1 iter 28607: train loss 0.73870. lr 3.000000e-03:  76%|███████▌  | 28608/37726 [11:46<04:11, 36.23it/s]

add 3 and 280:383_


epoch 1 iter 28707: train loss 0.71589. lr 3.000000e-03:  76%|███████▌  | 28708/37726 [11:49<04:01, 37.34it/s]

subtract 5 from -106:-101_


epoch 1 iter 28807: train loss 0.71842. lr 3.000000e-03:  76%|███████▋  | 28804/37726 [11:51<04:07, 36.07it/s]

divide -408 by -416:1.0835_


epoch 1 iter 28907: train loss 0.73391. lr 3.000000e-03:  77%|███████▋  | 28906/37726 [11:54<04:07, 35.66it/s]

divide -353 by 105:-3.3479_


epoch 1 iter 29007: train loss 0.72014. lr 3.000000e-03:  77%|███████▋  | 29008/37726 [11:57<03:56, 36.79it/s]

subtract -361 from -321:140_


epoch 1 iter 29107: train loss 0.72416. lr 3.000000e-03:  77%|███████▋  | 29107/37726 [11:59<04:02, 35.48it/s]

divide 45 by -19:-2.6211_


epoch 1 iter 29207: train loss 0.72948. lr 3.000000e-03:  77%|███████▋  | 29208/37726 [12:02<03:54, 36.30it/s]

divide -5 by 89:-0.0579_


epoch 1 iter 29307: train loss 0.72088. lr 3.000000e-03:  78%|███████▊  | 29306/37726 [12:04<03:49, 36.73it/s]

multiply -42 by 5:-210_


epoch 1 iter 29407: train loss 0.74512. lr 3.000000e-03:  78%|███████▊  | 29407/37726 [12:07<03:44, 37.09it/s]

multiply 200 by -101:-2000_


epoch 1 iter 29507: train loss 0.73241. lr 3.000000e-03:  78%|███████▊  | 29505/37726 [12:09<03:45, 36.47it/s]

multiply -344 by -15:4790_


epoch 1 iter 29607: train loss 0.74181. lr 3.000000e-03:  78%|███████▊  | 29607/37726 [12:12<03:39, 36.99it/s]

divide 433 by 1:433.0000_


epoch 1 iter 29707: train loss 0.73494. lr 3.000000e-03:  79%|███████▊  | 29707/37726 [12:14<03:39, 36.54it/s]

multiply 229 by -271:-73589_


epoch 1 iter 29807: train loss 0.74168. lr 3.000000e-03:  79%|███████▉  | 29806/37726 [12:17<03:33, 37.11it/s]

add -396 and -386:-782_


epoch 1 iter 29907: train loss 0.70638. lr 3.000000e-03:  79%|███████▉  | 29905/37726 [12:20<03:35, 36.32it/s]

divide 171 by -270:-0.5993_


epoch 1 iter 30007: train loss 0.74063. lr 3.000000e-03:  80%|███████▉  | 30008/37726 [12:22<03:30, 36.72it/s]

subtract -176 from -214:-22_


epoch 1 iter 30107: train loss 0.72069. lr 3.000000e-03:  80%|███████▉  | 30106/37726 [12:25<03:22, 37.63it/s]

divide 34 by 8:4.5000_


epoch 1 iter 30207: train loss 0.70045. lr 3.000000e-03:  80%|████████  | 30206/37726 [12:27<03:20, 37.56it/s]

subtract 4 from 461:457_


epoch 1 iter 30307: train loss 0.73767. lr 3.000000e-03:  80%|████████  | 30305/37726 [12:30<03:24, 36.24it/s]

divide 32 by 10:4.2000_


epoch 1 iter 30407: train loss 0.72402. lr 3.000000e-03:  81%|████████  | 30408/37726 [12:32<03:25, 35.65it/s]

divide 17 by -14:-1.0000_


epoch 1 iter 30507: train loss 0.72508. lr 3.000000e-03:  81%|████████  | 30507/37726 [12:35<03:17, 36.51it/s]

subtract 49 from 427:378_


epoch 1 iter 30607: train loss 0.71701. lr 3.000000e-03:  81%|████████  | 30606/37726 [12:37<03:12, 36.98it/s]

add -36 and 5:-31_


epoch 1 iter 30707: train loss 0.73272. lr 3.000000e-03:  81%|████████▏ | 30706/37726 [12:40<03:11, 36.71it/s]

multiply 1 by -419:-419_


epoch 1 iter 30807: train loss 0.72897. lr 3.000000e-03:  82%|████████▏ | 30808/37726 [12:42<03:03, 37.68it/s]

multiply -238 by -4:942_


epoch 1 iter 30907: train loss 0.69704. lr 3.000000e-03:  82%|████████▏ | 30906/37726 [12:45<03:13, 35.24it/s]

add 40 and 10:50_


epoch 1 iter 31007: train loss 0.71955. lr 3.000000e-03:  82%|████████▏ | 31006/37726 [12:48<03:02, 36.82it/s]

multiply -26 by 280:-7640_


epoch 1 iter 31107: train loss 0.73762. lr 3.000000e-03:  82%|████████▏ | 31105/37726 [12:50<03:02, 36.32it/s]

divide -265 by 174:-1.1265_


epoch 1 iter 31207: train loss 0.74470. lr 3.000000e-03:  83%|████████▎ | 31206/37726 [12:53<02:55, 37.17it/s]

subtract -41 from 225:266_


epoch 1 iter 31307: train loss 0.75429. lr 3.000000e-03:  83%|████████▎ | 31305/37726 [12:55<02:59, 35.82it/s]

divide -308 by 13:-24.7692_


epoch 1 iter 31407: train loss 0.72974. lr 3.000000e-03:  83%|████████▎ | 31404/37726 [12:58<03:04, 34.24it/s]

divide -17 by -42:0.3952_


epoch 1 iter 31507: train loss 0.72018. lr 3.000000e-03:  84%|████████▎ | 31506/37726 [13:00<02:50, 36.56it/s]

multiply -43 by 242:-10036_


epoch 1 iter 31607: train loss 0.73268. lr 3.000000e-03:  84%|████████▍ | 31608/37726 [13:03<02:49, 36.06it/s]

add 1 and 405:406_


epoch 1 iter 31707: train loss 0.73969. lr 3.000000e-03:  84%|████████▍ | 31704/37726 [13:05<02:46, 36.14it/s]

add 451 and 435:886_


epoch 1 iter 31807: train loss 0.73015. lr 3.000000e-03:  84%|████████▍ | 31808/37726 [13:08<02:41, 36.73it/s]

divide 169 by -478:-0.3551_


epoch 1 iter 31907: train loss 0.72966. lr 3.000000e-03:  85%|████████▍ | 31907/37726 [13:10<02:35, 37.49it/s]

add -13 and 201:198_


epoch 1 iter 32007: train loss 0.70132. lr 3.000000e-03:  85%|████████▍ | 32006/37726 [13:13<02:40, 35.54it/s]

multiply -186 by -11:1716_


epoch 1 iter 32106: train loss 0.70040. lr 3.000000e-03:  85%|████████▌ | 32105/37726 [13:16<02:32, 36.80it/s]

add 20 and -4:16_


epoch 1 iter 32207: train loss 0.70855. lr 3.000000e-03:  85%|████████▌ | 32208/37726 [13:18<02:31, 36.50it/s]

subtract -473 from 2:475_


epoch 1 iter 32307: train loss 0.74016. lr 3.000000e-03:  86%|████████▌ | 32306/37726 [13:21<02:29, 36.18it/s]

divide 473 by 394:1.6339_


epoch 1 iter 32407: train loss 0.70466. lr 3.000000e-03:  86%|████████▌ | 32406/37726 [13:23<02:26, 36.36it/s]

multiply -86 by -107:8498_


epoch 1 iter 32507: train loss 0.71059. lr 3.000000e-03:  86%|████████▌ | 32507/37726 [13:26<02:25, 35.81it/s]

divide -15 by 307:-0.0576_


epoch 1 iter 32607: train loss 0.72106. lr 3.000000e-03:  86%|████████▋ | 32606/37726 [13:28<02:20, 36.43it/s]

multiply 105 by -86:-192150_


epoch 1 iter 32707: train loss 0.71484. lr 3.000000e-03:  87%|████████▋ | 32706/37726 [13:31<02:20, 35.60it/s]

multiply -34 by 217:-8378_


epoch 1 iter 32807: train loss 0.72467. lr 3.000000e-03:  87%|████████▋ | 32808/37726 [13:33<02:17, 35.69it/s]

multiply -380 by -361:17420_


epoch 1 iter 32907: train loss 0.68619. lr 3.000000e-03:  87%|████████▋ | 32906/37726 [13:36<02:18, 34.89it/s]

add -49 and -44:-73_


epoch 1 iter 33007: train loss 0.72575. lr 3.000000e-03:  87%|████████▋ | 33006/37726 [13:39<02:05, 37.50it/s]

subtract -5 from 218:213_


epoch 1 iter 33107: train loss 0.69104. lr 3.000000e-03:  88%|████████▊ | 33104/37726 [13:41<02:15, 34.22it/s]

subtract 116 from -50:-176_


epoch 1 iter 33207: train loss 0.73084. lr 3.000000e-03:  88%|████████▊ | 33207/37726 [13:44<02:00, 37.42it/s]

add 32 and 25:57_


epoch 1 iter 33307: train loss 0.71728. lr 3.000000e-03:  88%|████████▊ | 33308/37726 [13:46<02:04, 35.60it/s]

add 4 and -207:-263_


epoch 1 iter 33407: train loss 0.73684. lr 3.000000e-03:  89%|████████▊ | 33405/37726 [13:49<01:59, 36.13it/s]

multiply -154 by -24:3927_


epoch 1 iter 33507: train loss 0.72774. lr 3.000000e-03:  89%|████████▉ | 33507/37726 [13:51<01:56, 36.37it/s]

divide 316 by -106:-3.6442_


epoch 1 iter 33607: train loss 0.71358. lr 3.000000e-03:  89%|████████▉ | 33608/37726 [13:54<01:51, 36.83it/s]

subtract -2 from -238:-236_


epoch 1 iter 33707: train loss 0.70625. lr 3.000000e-03:  89%|████████▉ | 33706/37726 [13:56<01:47, 37.42it/s]

divide 49 by -362:-0.1585_


epoch 1 iter 33807: train loss 0.71794. lr 3.000000e-03:  90%|████████▉ | 33808/37726 [13:59<01:45, 37.18it/s]

subtract 44 from -42:-86_


epoch 1 iter 33907: train loss 0.72476. lr 3.000000e-03:  90%|████████▉ | 33905/37726 [14:02<01:47, 35.65it/s]

multiply -158 by 3:-464_


epoch 1 iter 34007: train loss 0.70402. lr 3.000000e-03:  90%|█████████ | 34007/37726 [14:04<01:46, 34.90it/s]

multiply -130 by -215:274450_


epoch 1 iter 34107: train loss 0.72328. lr 3.000000e-03:  90%|█████████ | 34108/37726 [14:07<01:36, 37.36it/s]

add -40 and 45:4_


epoch 1 iter 34207: train loss 0.72631. lr 3.000000e-03:  91%|█████████ | 34204/37726 [14:09<01:42, 34.49it/s]

multiply 12 by -66:-832_


epoch 1 iter 34307: train loss 0.72435. lr 3.000000e-03:  91%|█████████ | 34305/37726 [14:12<01:35, 35.85it/s]

subtract -14 from 18:32_


epoch 1 iter 34407: train loss 0.72166. lr 3.000000e-03:  91%|█████████ | 34404/37726 [14:14<01:37, 34.15it/s]

divide -32 by -324:0.0966_


epoch 1 iter 34507: train loss 0.70510. lr 3.000000e-03:  91%|█████████▏| 34508/37726 [14:17<01:27, 36.62it/s]

multiply -5 by 424:-2120_


epoch 1 iter 34607: train loss 0.72400. lr 3.000000e-03:  92%|█████████▏| 34605/37726 [14:19<01:28, 35.09it/s]

multiply 471 by -33:-15743_


epoch 1 iter 34707: train loss 0.72075. lr 3.000000e-03:  92%|█████████▏| 34708/37726 [14:22<01:20, 37.63it/s]

divide 403 by 1874:_


epoch 1 iter 34807: train loss 0.73517. lr 3.000000e-03:  92%|█████████▏| 34808/37726 [14:25<01:20, 36.46it/s]

add 48 and -490:-442_


epoch 1 iter 34907: train loss 0.72463. lr 3.000000e-03:  93%|█████████▎| 34908/37726 [14:27<01:21, 34.76it/s]

divide 27 by -48:-0.5833_


epoch 1 iter 35007: train loss 0.71425. lr 3.000000e-03:  93%|█████████▎| 35006/37726 [14:30<01:14, 36.75it/s]

subtract 118 from -26:-134_


epoch 1 iter 35107: train loss 0.71926. lr 3.000000e-03:  93%|█████████▎| 35105/37726 [14:32<01:13, 35.73it/s]

divide -4 by 225:-0.0176_


epoch 1 iter 35207: train loss 0.72512. lr 3.000000e-03:  93%|█████████▎| 35207/37726 [14:35<01:10, 35.54it/s]

subtract -14 from -473:-469_


epoch 1 iter 35307: train loss 0.70018. lr 3.000000e-03:  94%|█████████▎| 35308/37726 [14:37<01:05, 36.90it/s]

multiply -101 by -140:18410_


epoch 1 iter 35407: train loss 0.74215. lr 3.000000e-03:  94%|█████████▍| 35404/37726 [14:40<01:04, 35.98it/s]

add 305 and 212:627_


epoch 1 iter 35507: train loss 0.72158. lr 3.000000e-03:  94%|█████████▍| 35507/37726 [14:43<01:00, 36.93it/s]

divide -50 by 92:-0.8212_


epoch 1 iter 35607: train loss 0.73993. lr 3.000000e-03:  94%|█████████▍| 35604/37726 [14:45<01:01, 34.67it/s]

subtract -337 from 308:617_


epoch 1 iter 35707: train loss 0.70149. lr 3.000000e-03:  95%|█████████▍| 35704/37726 [14:48<00:59, 34.08it/s]

divide -1 by 255:-0.0038_


epoch 1 iter 35807: train loss 0.71296. lr 3.000000e-03:  95%|█████████▍| 35807/37726 [14:50<00:51, 37.59it/s]

multiply -4 by 89:-356_


epoch 1 iter 35907: train loss 0.70427. lr 3.000000e-03:  95%|█████████▌| 35906/37726 [14:53<00:51, 35.22it/s]

divide 29 by 439:0.0647_


epoch 1 iter 36007: train loss 0.72276. lr 3.000000e-03:  95%|█████████▌| 36005/37726 [14:55<00:48, 35.66it/s]

add -1 and 91:90_


epoch 1 iter 36107: train loss 0.72885. lr 3.000000e-03:  96%|█████████▌| 36108/37726 [14:58<00:46, 35.05it/s]

divide 59 by 391:0.1630_


epoch 1 iter 36207: train loss 0.72284. lr 3.000000e-03:  96%|█████████▌| 36208/37726 [15:01<00:43, 35.26it/s]

subtract 6 from -493:-499_


epoch 1 iter 36307: train loss 0.73589. lr 3.000000e-03:  96%|█████████▌| 36308/37726 [15:03<00:38, 36.68it/s]

multiply -425 by -227:10445_


epoch 1 iter 36407: train loss 0.74295. lr 3.000000e-03:  97%|█████████▋| 36408/37726 [15:06<00:37, 34.93it/s]

multiply -36 by 20:-720_


epoch 1 iter 36507: train loss 0.72520. lr 3.000000e-03:  97%|█████████▋| 36506/37726 [15:08<00:34, 35.50it/s]

multiply -5 by -449:2245_


epoch 1 iter 36606: train loss 0.70521. lr 3.000000e-03:  97%|█████████▋| 36607/37726 [15:11<00:33, 33.17it/s]

divide 271 by -236:-1.2198_


epoch 1 iter 36707: train loss 0.69765. lr 3.000000e-03:  97%|█████████▋| 36708/37726 [15:14<00:28, 35.72it/s]

multiply -253 by -403:91575_


epoch 1 iter 36807: train loss 0.72191. lr 3.000000e-03:  98%|█████████▊| 36806/37726 [15:16<00:25, 36.10it/s]

divide -298 by 203:-1.4045_


epoch 1 iter 36907: train loss 0.74362. lr 3.000000e-03:  98%|█████████▊| 36905/37726 [15:19<00:23, 35.45it/s]

subtract -2 from -159:-157_


epoch 1 iter 37007: train loss 0.70686. lr 3.000000e-03:  98%|█████████▊| 37005/37726 [15:21<00:20, 36.02it/s]

divide -1 by -300:0.0033_


epoch 1 iter 37107: train loss 0.71596. lr 3.000000e-03:  98%|█████████▊| 37107/37726 [15:24<00:17, 35.99it/s]

multiply -37 by 69:-20413_


epoch 1 iter 37207: train loss 0.72811. lr 3.000000e-03:  99%|█████████▊| 37207/37726 [15:26<00:14, 34.64it/s]

divide -21 by 233:-0.0970_


epoch 1 iter 37307: train loss 0.71064. lr 3.000000e-03:  99%|█████████▉| 37308/37726 [15:29<00:11, 37.87it/s]

multiply -4 by -223:912_


epoch 1 iter 37407: train loss 0.71380. lr 3.000000e-03:  99%|█████████▉| 37405/37726 [15:31<00:08, 36.19it/s]

divide 2 by -237:-0.0074_


epoch 1 iter 37507: train loss 0.71324. lr 3.000000e-03:  99%|█████████▉| 37507/37726 [15:34<00:05, 37.12it/s]

add -46 and 408:362_


epoch 1 iter 37607: train loss 0.72092. lr 3.000000e-03: 100%|█████████▉| 37605/37726 [15:36<00:03, 35.86it/s]

subtract 375 from -34:-499_


epoch 1 iter 37707: train loss 0.72372. lr 3.000000e-03: 100%|█████████▉| 37708/37726 [15:39<00:00, 36.99it/s]

add 17 and 32:69_


epoch 1 iter 37725: train loss 0.72390. lr 3.000000e-03: 100%|██████████| 37726/37726 [15:40<00:00, 40.13it/s]


In [6]:
# Spot-check the trained model on freshly generated arithmetic problems.
# For every problem, one line is printed: a check mark when the sampled answer
# equals the reference, otherwise a cross with the signed error
# (reference - prediction), or a note when the answer is not a number at all.
(xs, ys), _ = generate_data(30, difficulty=3, extra=False)
for x, y in zip(xs, ys):
    prompt = x + ':'
    yhat = trainer.sample(prompt)
    # sample() stops right after emitting the '_' terminator, but it also gives
    # up after a fixed number of steps, so only strip a terminator that exists.
    if yhat.endswith('_'):
        yhat = yhat[:-1]
    # sample() returns the prompt followed by the generated text, so the answer
    # is everything after the prompt. Splitting on the last ':' instead would
    # pick up the wrong number when the model rambles into a second question.
    answer = yhat[len(prompt):]
    try:
        i = float(answer)
    except ValueError:
        # Empty or non-numeric generations count as failures instead of
        # aborting the whole evaluation loop.
        print('❌ unparseable answer\t' + yhat)
        continue
    print(('✅\t\t' if float(y) == i else '❌ err: ' + str(float(y) - i) + '\t') + yhat)

✅		divide -33 by 5:-6.6000
✅		multiply 1 by 43:43
✅		add 34 and 454:488
✅		add 3 and 347:350
✅		add 3 and -456:-453
✅		subtract -421 from -4:417
✅		subtract 3 from -49:-52
❌ err: 203.0	add 223 and 2:22
❌ err: -8.0	add 49 and 2:59
✅		add 27 and -4:23
❌ err: 2.0	add 4 and -2:-0
❌ err: 80.0	multiply 21 by -49:-1109
✅		add 20 and -4:16
✅		multiply -38 by -3:114
✅		divide 2 by 459:0.0044
✅		multiply -7 by -2:14
❌ err: 0.2065999999999999	divide 329 by -24:-13.9149
✅		add 206 and 1:207
✅		add -3 and -16:-19
✅		divide -2 by 23:-0.0870
❌ err: -40.0	multiply 14 by -196:-2704
✅		subtract 2 from -50:-52
❌ err: -30.0	multiply -271 by 4:-1054
✅		divide 43 by 1:43.0000
