In [1]:
import os

os.environ['WANDB_API_KEY'] = ''

In [2]:
import pip

def install(package):
    if hasattr(pip, 'main'):
        pip.main(['install', package])
    else:
        pip._internal.main(['install', package])
        
install('wandb')

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [3]:
from transformers import (
    AutoConfig, 
    AutoModelForCausalLM, 
    AutoTokenizer, 
    default_data_collator,
    TrainingArguments,
    Trainer,
    TrainerCallback
)
from datasets import load_dataset, load_from_disk

import ray
from ray import train
from ray.train import Checkpoint
from ray.train.huggingface import TransformersTrainer
from ray.train import ScalingConfig
from ray.train.torch import TorchTrainer, get_device
from transformers.trainer_utils import get_last_checkpoint
import torch
from torch.utils.data import DataLoader
import numpy as np
import os
import shutil
import wandb

wandb_api = wandb.Api()

In [4]:
WANDB_PROJECT = os.environ.get('WANDB_PROJECT', 'run-ray')
WANDB_API_KEY = os.environ.get('WANDB_API_KEY', wandb_api.api_key)
MINIO = os.environ.get('MINIO', 'http://minio:9000')

In [5]:
def train_func(config):
    
    from streaming.base.format.mds.encodings import Encoding, _encodings
    from streaming import StreamingDataset
    import streaming
    import s3fs
    
    MINIO = os.environ.get('MINIO', 'http://minio:9000')
    fs = s3fs.S3FileSystem(endpoint_url = MINIO, anon = True)
    
    device = str(get_device()).replace(':', '-')
    print(device)

    class UInt16(Encoding):
        def encode(self, obj) -> bytes:
            return obj.tobytes()

        def decode(self, data: bytes):
            return np.frombuffer(data, np.uint16)

    _encodings['uint16'] = UInt16
    
    class DatasetFixed(torch.utils.data.Dataset):
        def __init__(self, local, remote):
            
            streaming.base.util.clean_stale_shared_memory()
            self.dataset = StreamingDataset(local=local, remote = remote, download_timeout = 300)

        def __getitem__(self, idx):
            data = self.dataset[idx]
            data['labels'] = data["input_ids"].copy()
            data.pop('token_type_ids', None)
            for k in data.keys():
                data[k] = data[k].astype(np.int64)
            return data

        def __len__(self):
            return len(self.dataset)
    
    local = os.path.join('/home/ubuntu/storage', config['local'])
    shutil.rmtree(local, ignore_errors = True)
    train_dataset = DatasetFixed(local=local, remote=config['remote'])
    
    # https://github.com/mosaicml/streaming/issues/307#issuecomment-1729829065
    def inf_loop_dataloader(dataloader: torch.utils.data.DataLoader):
        while True:
            for batch in dataloader:
                yield batch
    dataloader = DataLoader(train_dataset, batch_size=2)
    dataset_iterator = iter(inf_loop_dataloader(dataloader))
    batch = next(iter(dataset_iterator))

#     train_dataset = load_from_disk('s3://train/wiki-test', 
#                                  storage_options=fs.storage_options, 
#                                  keep_in_memory = False)
    
    MODEL_NAME = "gpt2"
    model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
    directory = f'{MODEL_NAME}-test'
    
    output_dir = os.path.join('/home/ubuntu/storage', directory)
    output_temp = os.path.join('/home/ubuntu/storage', 'temp' + device)
    s3_bucket = 'train'
    s3_output_dir = os.path.join(s3_bucket, directory)
    
    args = TrainingArguments(
        output_dir=output_dir,
        save_strategy="steps",
        logging_strategy="steps",
        learning_rate=2e-5,
        weight_decay=0.01,
        max_steps=10000,
        save_steps = 100,
        save_total_limit = 2,
        logging_steps = 1,
        per_device_train_batch_size = 6,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        data_collator=default_data_collator,
    )
    
    
    class S3Callback(TrainerCallback):
        def on_save(self, args, state, control, **kwargs):
            fs.delete(s3_output_dir)
            fs.put(output_dir, s3_bucket, recursive=True)
        
    trainer.add_callback(S3Callback())
    
    checkpoints = []
    try:
        checkpoints = fs.ls(s3_output_dir)
        checkpoints = [f for f in checkpoints if 'checkpoint-' in f]
        checkpoints = sorted(checkpoints, key = lambda x: int(x.split('-')[-1]))
    except:
        pass
    
    if len(checkpoints):
        checkpoint = checkpoints[-1]
        print(f'load checkpoint from {checkpoint} into {output_temp}')
        shutil.rmtree(output_temp, ignore_errors = True)
        fs.get(checkpoint, output_temp, recursive = True)
        trainer.train(resume_from_checkpoint=output_temp)
    else:
        trainer.train()

In [6]:
runtime_env = {
    'pip': ['wandb', 's3fs', 'mosaicml-streaming'],
    'env_vars': {
        'WANDB_PROJECT': WANDB_PROJECT,
        'WANDB_API_KEY': WANDB_API_KEY,
        'S3_ENDPOINT_URL': MINIO,
        'MINIO': MINIO,
    }
}

In [7]:
class RayConnection:
    def __init__(self, address, **kwargs):
        ray.init(address=address, **kwargs)

    def __enter__(self):
        return self

    def __exit__(self, typ, value, traceback):
        ray.shutdown()

In [None]:
with RayConnection("ray://localhost:10001", runtime_env=runtime_env):
    scaling_config = ScalingConfig(
        num_workers=8, 
        use_gpu=True,
    )
    run_config = train.RunConfig(failure_config=train.FailureConfig(max_failures=-1))
    ray_trainer = TorchTrainer(
        train_func,
        train_loop_config={
            'local': 'local_dir',
            'remote': 's3://train/indexed'
        },
        scaling_config=scaling_config,
        run_config=run_config

    )
    result = ray_trainer.fit()

[36m(TunerInternal pid=33311)[0m [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


[36m(TunerInternal pid=33311)[0m 
[36m(TunerInternal pid=33311)[0m View detailed results here: /home/ubuntu/ray_results/TorchTrainer_2023-11-22_17-57-28
[36m(TunerInternal pid=33311)[0m To visualize your results with TensorBoard, run: `tensorboard --logdir /home/ubuntu/ray_results/TorchTrainer_2023-11-22_17-57-28`


[36m(TunerInternal pid=33311)[0m AIR_VERBOSITY is set, ignoring passed-in ProgressReporter for now.


[36m(TunerInternal pid=33311)[0m 
[36m(TunerInternal pid=33311)[0m Training started with configuration:
[36m(TunerInternal pid=33311)[0m ╭───────────────────────────────────────────────╮
[36m(TunerInternal pid=33311)[0m │ Training config                               │
[36m(TunerInternal pid=33311)[0m ├───────────────────────────────────────────────┤
[36m(TunerInternal pid=33311)[0m │ train_loop_config/local             local_dir │
[36m(TunerInternal pid=33311)[0m │ train_loop_config/remote   s3://train/indexed │
[36m(TunerInternal pid=33311)[0m ╰───────────────────────────────────────────────╯


[36m(TorchTrainer pid=33520)[0m Starting distributed worker processes: ['33620 (10.208.0.249)', '33621 (10.208.0.249)', '33625 (10.208.0.249)', '33626 (10.208.0.249)', '3487 (10.208.0.238)', '3488 (10.208.0.238)', '3489 (10.208.0.238)', '3490 (10.208.0.238)']
[36m(RayTrainWorker pid=33620)[0m Setting up process group for: env:// [rank=0, world_size=8]


[36m(RayTrainWorker pid=3489, ip=10.208.0.238)[0m cuda-2
[36m(RayTrainWorker pid=3488, ip=10.208.0.238)[0m cuda-1
[36m(RayTrainWorker pid=3490, ip=10.208.0.238)[0m cuda-3
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m cuda-0
[36m(RayTrainWorker pid=33626)[0m cuda-3
[36m(RayTrainWorker pid=33621)[0m cuda-1
[36m(RayTrainWorker pid=33625)[0m cuda-2
[36m(RayTrainWorker pid=33620)[0m cuda-0
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m load checkpoint from train/gpt2-test/checkpoint-500 into /home/ubuntu/storage/tempcuda-0
[36m(RayTrainWorker pid=33620)[0m load checkpoint from train/gpt2-test/checkpoint-500 into /home/ubuntu/storage/tempcuda-0
[36m(RayTrainWorker pid=3489, ip=10.208.0.238)[0m load checkpoint from train/gpt2-test/checkpoint-500 into /home/ubuntu/storage/tempcuda-2
[36m(RayTrainWorker pid=33621)[0m load checkpoint from train/gpt2-test/checkpoint-500 into /home/ubuntu/storage/tempcuda-1
[36m(RayTrainWorker pid=33625)[0m load checkpoint from tra

[36m(RayTrainWorker pid=3488, ip=10.208.0.238)[0m There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
[36m(RayTrainWorker pid=3490, ip=10.208.0.238)[0m There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
[36m(RayTrainWorker pid=33620)[0m There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
[36m(RayTrainWorker pid=33621)[0m There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
[36m(RayTrainWorker pid=3489, ip=10.208.0.238)[0m There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
[36m(RayTrainWorker pid=33625)[0m There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
[36m(RayTrainWorker pid=33626)[0m There were missing keys in the checkpoint model loaded: ['lm_head.weight'].
  0%|          | 0/10000 [00:00<?, ?it/s

[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1623, 'learning_rate': 1.8998e-05, 'epoch': 0.77}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1623, 'learning_rate': 1.8998e-05, 'epoch': 0.77}


  5%|▌         | 502/10000 [00:01<00:23, 404.00it/s]
  5%|▌         | 502/10000 [00:03<00:51, 182.91it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.17, 'learning_rate': 1.8996000000000002e-05, 'epoch': 0.77}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.17, 'learning_rate': 1.8996000000000002e-05, 'epoch': 0.77}


  5%|▌         | 503/10000 [00:02<00:23, 404.00it/s]
  5%|▌         | 503/10000 [00:04<00:51, 182.91it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1151, 'learning_rate': 1.8994e-05, 'epoch': 0.77}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1151, 'learning_rate': 1.8994e-05, 'epoch': 0.77}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2032, 'learning_rate': 1.8992e-05, 'epoch': 0.78}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2032, 'learning_rate': 1.8992e-05, 'epoch': 0.78}


  5%|▌         | 504/10000 [00:04<00:51, 182.91it/s]
  5%|▌         | 504/10000 [00:03<00:23, 404.00it/s]


[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.389, 'learning_rate': 1.8990000000000003e-05, 'epoch': 0.78}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.389, 'learning_rate': 1.8990000000000003e-05, 'epoch': 0.78}


  5%|▌         | 505/10000 [00:05<00:51, 182.91it/s]
  5%|▌         | 505/10000 [00:03<00:23, 404.00it/s]
  5%|▌         | 506/10000 [00:04<00:23, 404.00it/s]
  5%|▌         | 506/10000 [00:06<00:51, 182.91it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.3192, 'learning_rate': 1.8988000000000002e-05, 'epoch': 0.78}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.3192, 'learning_rate': 1.8988000000000002e-05, 'epoch': 0.78}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1087, 'learning_rate': 1.8986e-05, 'epoch': 0.78}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1087, 'learning_rate': 1.8986e-05, 'epoch': 0.78}


  5%|▌         | 507/10000 [00:06<00:51, 182.91it/s]
  5%|▌         | 507/10000 [00:05<00:23, 404.00it/s]


[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2713, 'learning_rate': 1.8984000000000003e-05, 'epoch': 0.78}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2713, 'learning_rate': 1.8984000000000003e-05, 'epoch': 0.78}


  5%|▌         | 508/10000 [00:07<00:51, 182.91it/s]
  5%|▌         | 508/10000 [00:06<00:23, 404.00it/s]
  5%|▌         | 509/10000 [00:06<00:23, 404.00it/s]
  5%|▌         | 509/10000 [00:08<00:51, 182.91it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.3064, 'learning_rate': 1.8982000000000002e-05, 'epoch': 0.78}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.3064, 'learning_rate': 1.8982000000000002e-05, 'epoch': 0.78}


  5%|▌         | 510/10000 [00:07<00:23, 404.00it/s]
  5%|▌         | 510/10000 [00:08<00:51, 182.91it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2174, 'learning_rate': 1.898e-05, 'epoch': 0.78}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2174, 'learning_rate': 1.898e-05, 'epoch': 0.78}


  5%|▌         | 511/10000 [00:08<00:23, 404.00it/s]
  5%|▌         | 511/10000 [00:09<00:51, 182.91it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.3534, 'learning_rate': 1.8978e-05, 'epoch': 0.79}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.3534, 'learning_rate': 1.8978e-05, 'epoch': 0.79}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1192, 'learning_rate': 1.8976000000000003e-05, 'epoch': 0.79}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1192, 'learning_rate': 1.8976000000000003e-05, 'epoch': 0.79}


  5%|▌         | 512/10000 [00:10<00:51, 182.91it/s]
  5%|▌         | 512/10000 [00:08<00:23, 404.00it/s]
  5%|▌         | 513/10000 [00:09<00:23, 404.00it/s]
  5%|▌         | 513/10000 [00:11<00:51, 182.91it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.34, 'learning_rate': 1.8974e-05, 'epoch': 0.79}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.34, 'learning_rate': 1.8974e-05, 'epoch': 0.79}


  5%|▌         | 514/10000 [00:10<00:23, 404.00it/s]
  5%|▌         | 514/10000 [00:11<00:51, 182.91it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1637, 'learning_rate': 1.8972e-05, 'epoch': 0.79}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1637, 'learning_rate': 1.8972e-05, 'epoch': 0.79}


  5%|▌         | 515/10000 [00:11<00:23, 404.00it/s]
  5%|▌         | 515/10000 [00:12<00:51, 182.91it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1746, 'learning_rate': 1.8970000000000003e-05, 'epoch': 0.79}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1746, 'learning_rate': 1.8970000000000003e-05, 'epoch': 0.79}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2902, 'learning_rate': 1.8968000000000002e-05, 'epoch': 0.79}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2902, 'learning_rate': 1.8968000000000002e-05, 'epoch': 0.79}


  5%|▌         | 516/10000 [00:13<00:51, 182.91it/s]
  5%|▌         | 516/10000 [00:11<00:23, 404.00it/s]


[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.307, 'learning_rate': 1.8966e-05, 'epoch': 0.8}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.307, 'learning_rate': 1.8966e-05, 'epoch': 0.8}


  5%|▌         | 517/10000 [00:14<00:51, 182.91it/s]
  5%|▌         | 517/10000 [00:12<00:23, 404.00it/s]
  5%|▌         | 518/10000 [00:13<00:23, 404.00it/s]
  5%|▌         | 518/10000 [00:14<00:51, 182.91it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1216, 'learning_rate': 1.8964000000000003e-05, 'epoch': 0.8}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1216, 'learning_rate': 1.8964000000000003e-05, 'epoch': 0.8}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1898, 'learning_rate': 1.8962000000000002e-05, 'epoch': 0.8}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1898, 'learning_rate': 1.8962000000000002e-05, 'epoch': 0.8}


  5%|▌         | 519/10000 [00:15<00:51, 182.91it/s]
  5%|▌         | 519/10000 [00:14<00:23, 404.00it/s]


[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2323, 'learning_rate': 1.896e-05, 'epoch': 0.8}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2323, 'learning_rate': 1.896e-05, 'epoch': 0.8}


  5%|▌         | 520/10000 [00:16<06:38, 23.78it/s] 
  5%|▌         | 520/10000 [00:14<00:23, 404.00it/s]
  5%|▌         | 521/10000 [00:15<00:23, 404.00it/s]
  5%|▌         | 521/10000 [00:17<07:06, 22.24it/s] 


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2345, 'learning_rate': 1.8958e-05, 'epoch': 0.8}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2345, 'learning_rate': 1.8958e-05, 'epoch': 0.8}


  5%|▌         | 522/10000 [00:16<00:23, 404.00it/s]
  5%|▌         | 522/10000 [00:17<07:06, 22.24it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1486, 'learning_rate': 1.8956e-05, 'epoch': 0.8}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1486, 'learning_rate': 1.8956e-05, 'epoch': 0.8}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1585, 'learning_rate': 1.8954000000000002e-05, 'epoch': 0.8}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1585, 'learning_rate': 1.8954000000000002e-05, 'epoch': 0.8}


  5%|▌         | 523/10000 [00:18<07:06, 22.24it/s]
  5%|▌         | 523/10000 [00:17<00:23, 404.00it/s]


[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2794, 'learning_rate': 1.8952e-05, 'epoch': 0.81}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2794, 'learning_rate': 1.8952e-05, 'epoch': 0.81}


  5%|▌         | 524/10000 [00:19<07:06, 22.24it/s]
  5%|▌         | 524/10000 [00:18<00:23, 404.00it/s]


[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2378, 'learning_rate': 1.8950000000000003e-05, 'epoch': 0.81}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2378, 'learning_rate': 1.8950000000000003e-05, 'epoch': 0.81}


  5%|▌         | 525/10000 [00:20<07:06, 22.24it/s] 
  5%|▌         | 525/10000 [00:18<00:23, 404.00it/s]
  5%|▌         | 526/10000 [00:19<00:23, 404.00it/s]
  5%|▌         | 526/10000 [00:21<07:05, 22.24it/s] 


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1505, 'learning_rate': 1.8948000000000002e-05, 'epoch': 0.81}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1505, 'learning_rate': 1.8948000000000002e-05, 'epoch': 0.81}


  5%|▌         | 526/10000 [00:20<00:23, 404.00it/s]


[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1301, 'learning_rate': 1.8946e-05, 'epoch': 0.81}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1301, 'learning_rate': 1.8946e-05, 'epoch': 0.81}


  5%|▌         | 527/10000 [00:22<07:05, 22.24it/s] 
  5%|▌         | 527/10000 [00:20<08:26, 18.70it/s] 
  5%|▌         | 528/10000 [00:21<08:53, 17.76it/s]
  5%|▌         | 528/10000 [00:22<07:05, 22.24it/s] 


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.3755, 'learning_rate': 1.8944000000000004e-05, 'epoch': 0.81}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.3755, 'learning_rate': 1.8944000000000004e-05, 'epoch': 0.81}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2046, 'learning_rate': 1.8942000000000003e-05, 'epoch': 0.81}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2046, 'learning_rate': 1.8942000000000003e-05, 'epoch': 0.81}


  5%|▌         | 529/10000 [00:23<11:58, 13.18it/s]
  5%|▌         | 529/10000 [00:21<08:53, 17.76it/s]


[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1236, 'learning_rate': 1.894e-05, 'epoch': 0.82}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1236, 'learning_rate': 1.894e-05, 'epoch': 0.82}


  5%|▌         | 530/10000 [00:24<11:58, 13.18it/s] 
  5%|▌         | 530/10000 [00:22<08:53, 17.76it/s]
  5%|▌         | 531/10000 [00:23<08:53, 17.76it/s]
  5%|▌         | 531/10000 [00:24<11:58, 13.18it/s] 


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1462, 'learning_rate': 1.8938e-05, 'epoch': 0.82}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1462, 'learning_rate': 1.8938e-05, 'epoch': 0.82}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2004, 'learning_rate': 1.8936e-05, 'epoch': 0.82}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2004, 'learning_rate': 1.8936e-05, 'epoch': 0.82}


  5%|▌         | 532/10000 [00:25<11:58, 13.18it/s] 
  5%|▌         | 532/10000 [00:23<08:52, 17.76it/s]
  5%|▌         | 533/10000 [00:24<08:52, 17.76it/s]
  5%|▌         | 533/10000 [00:26<11:58, 13.18it/s] 


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2413, 'learning_rate': 1.8934000000000002e-05, 'epoch': 0.82}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2413, 'learning_rate': 1.8934000000000002e-05, 'epoch': 0.82}


  5%|▌         | 534/10000 [00:25<08:52, 17.76it/s]
  5%|▌         | 534/10000 [00:26<15:24, 10.24it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2863, 'learning_rate': 1.8932e-05, 'epoch': 0.82}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2863, 'learning_rate': 1.8932e-05, 'epoch': 0.82}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1856, 'learning_rate': 1.893e-05, 'epoch': 0.82}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1856, 'learning_rate': 1.893e-05, 'epoch': 0.82}


  5%|▌         | 535/10000 [00:27<15:24, 10.24it/s]
  5%|▌         | 535/10000 [00:25<08:52, 17.76it/s]


[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1349, 'learning_rate': 1.8928000000000002e-05, 'epoch': 0.82}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1349, 'learning_rate': 1.8928000000000002e-05, 'epoch': 0.82}


  5%|▌         | 536/10000 [00:27<15:24, 10.24it/s] 
  5%|▌         | 536/10000 [00:26<08:52, 17.76it/s]
  5%|▌         | 537/10000 [00:26<08:52, 17.76it/s]
  5%|▌         | 537/10000 [00:28<17:37,  8.95it/s] 


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2981, 'learning_rate': 1.8926e-05, 'epoch': 0.83}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2981, 'learning_rate': 1.8926e-05, 'epoch': 0.83}


  5%|▌         | 538/10000 [00:27<08:52, 17.76it/s]
  5%|▌         | 538/10000 [00:29<17:37,  8.95it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.083, 'learning_rate': 1.8924000000000004e-05, 'epoch': 0.83}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.083, 'learning_rate': 1.8924000000000004e-05, 'epoch': 0.83}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2628, 'learning_rate': 1.8922000000000003e-05, 'epoch': 0.83}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2628, 'learning_rate': 1.8922000000000003e-05, 'epoch': 0.83}


  5%|▌         | 539/10000 [00:30<20:37,  7.65it/s]
  5%|▌         | 539/10000 [00:28<08:52, 17.76it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.0872, 'learning_rate': 1.8920000000000002e-05, 'epoch': 0.83}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.0872, 'learning_rate': 1.8920000000000002e-05, 'epoch': 0.83}


  5%|▌         | 540/10000 [00:29<08:52, 17.76it/s]
  5%|▌         | 540/10000 [00:30<20:37,  7.65it/s] 
  5%|▌         | 541/10000 [00:31<24:12,  6.51it/s] 
  5%|▌         | 541/10000 [00:30<08:52, 17.76it/s]
  5%|▌         | 541/10000 [00:31<24:12,  6.51it/s] 


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1142, 'learning_rate': 1.8918e-05, 'epoch': 0.83}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1142, 'learning_rate': 1.8918e-05, 'epoch': 0.83}


  5%|▌         | 542/10000 [00:30<08:52, 17.76it/s]
  5%|▌         | 542/10000 [00:32<27:11,  5.80it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2552, 'learning_rate': 1.8916e-05, 'epoch': 0.83}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2552, 'learning_rate': 1.8916e-05, 'epoch': 0.83}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1712, 'learning_rate': 1.8914000000000002e-05, 'epoch': 0.84}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1712, 'learning_rate': 1.8914000000000002e-05, 'epoch': 0.84}


  5%|▌         | 543/10000 [00:33<31:08,  5.06it/s] 
  5%|▌         | 543/10000 [00:31<08:52, 17.76it/s]
  5%|▌         | 544/10000 [00:32<08:52, 17.76it/s]
  5%|▌         | 544/10000 [00:33<34:02,  4.63it/s] 


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.0801, 'learning_rate': 1.8912e-05, 'epoch': 0.84}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.0801, 'learning_rate': 1.8912e-05, 'epoch': 0.84}


  5%|▌         | 545/10000 [00:33<17:17,  9.11it/s]
  5%|▌         | 545/10000 [00:34<40:07,  3.93it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1502, 'learning_rate': 1.891e-05, 'epoch': 0.84}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1502, 'learning_rate': 1.891e-05, 'epoch': 0.84}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1575, 'learning_rate': 1.8908000000000003e-05, 'epoch': 0.84}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1575, 'learning_rate': 1.8908000000000003e-05, 'epoch': 0.84}


  5%|▌         | 546/10000 [00:35<47:32,  3.31it/s]
  5%|▌         | 546/10000 [00:33<18:03,  8.73it/s]


[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2535, 'learning_rate': 1.8906e-05, 'epoch': 0.84}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2535, 'learning_rate': 1.8906e-05, 'epoch': 0.84}


  5%|▌         | 547/10000 [00:36<56:15,  2.80it/s] 
  5%|▌         | 547/10000 [00:34<18:03,  8.73it/s]
  5%|▌         | 548/10000 [00:35<18:03,  8.73it/s]
  5%|▌         | 548/10000 [00:36<1:00:40,  2.60it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.133, 'learning_rate': 1.8904000000000004e-05, 'epoch': 0.84}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.133, 'learning_rate': 1.8904000000000004e-05, 'epoch': 0.84}


  5%|▌         | 549/10000 [00:37<1:11:12,  2.21it/s]
  5%|▌         | 549/10000 [00:36<18:03,  8.73it/s]
  5%|▌         | 549/10000 [00:37<1:11:12,  2.21it/s]


[36m(RayTrainWorker pid=33620)[0m {'loss': 7.2892, 'learning_rate': 1.8902000000000003e-05, 'epoch': 0.84}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.2892, 'learning_rate': 1.8902000000000003e-05, 'epoch': 0.84}
[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1592, 'learning_rate': 1.8900000000000002e-05, 'epoch': 0.85}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1592, 'learning_rate': 1.8900000000000002e-05, 'epoch': 0.85}


  6%|▌         | 550/10000 [00:38<1:21:33,  1.93it/s]
  6%|▌         | 550/10000 [00:36<18:03,  8.73it/s]


[36m(RayTrainWorker pid=3487, ip=10.208.0.238)[0m {'loss': 7.1803, 'learning_rate': 1.8898e-05, 'epoch': 0.85}
[36m(RayTrainWorker pid=33620)[0m {'loss': 7.1803, 'learning_rate': 1.8898e-05, 'epoch': 0.85}


  6%|▌         | 551/10000 [00:39<1:31:12,  1.73it/s]
  6%|▌         | 551/10000 [00:37<18:02,  8.73it/s]
