Finetunes [distilbert-base-uncased](https://huggingface.co/distilbert-base-uncased) on [MRPC](https://huggingface.co/datasets/glue/viewer/mrpc/train). Adapted from a [PyTorch Lightning example](https://lightning.ai/docs/pytorch/1.9.5/notebooks/lightning_examples/text-transformers.html).

In [1]:
%pip install -q torch transformers pytorch_lightning==1.9.5 datasets
%pip install wandb -qU
%pip install -q "ray[tune]" torchvision

^C
Note: you may need to restart the kernel to use updated packages.


In [2]:
from datetime import datetime
from typing import Optional

import wandb
import datasets
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    get_inverse_sqrt_schedule,
    get_cosine_schedule_with_warmup,
    get_polynomial_decay_schedule_with_warmup
)
from pytorch_lightning import LightningDataModule, LightningModule, Trainer, seed_everything

from ray.train.lightning import (
    RayDDPStrategy,
    RayLightningEnvironment,
    RayTrainReportCallback,
    prepare_trainer,
)

from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.train import RunConfig, ScalingConfig, CheckpointConfig
from ray.train.torch import TorchTrainer

In [None]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [3]:
class GLUEDataModule(LightningDataModule):
    task_text_field_map = {
        "cola": ["sentence"],
        "sst2": ["sentence"],
        "mrpc": ["sentence1", "sentence2"],
        "qqp": ["question1", "question2"],
        "stsb": ["sentence1", "sentence2"],
        "mnli": ["premise", "hypothesis"],
        "qnli": ["question", "sentence"],
        "rte": ["sentence1", "sentence2"],
        "wnli": ["sentence1", "sentence2"],
        "ax": ["premise", "hypothesis"],
    }

    glue_task_num_labels = {
        "cola": 2,
        "sst2": 2,
        "mrpc": 2,
        "qqp": 2,
        "stsb": 1,
        "mnli": 3,
        "qnli": 2,
        "rte": 2,
        "wnli": 2,
        "ax": 3,
    }

    loader_columns = [
        "datasets_idx",
        "input_ids",
        "token_type_ids",
        "attention_mask",
        "start_positions",
        "end_positions",
        "labels",
    ]

    def __init__(
        self,
        model_name_or_path: str,
        task_name: str = "mrpc",
        max_seq_length: int = 128,
        train_batch_size: int = 32,
        eval_batch_size: int = 32,
        **kwargs,
    ):
        super().__init__()
        self.model_name_or_path = model_name_or_path
        self.task_name = task_name
        self.max_seq_length = max_seq_length
        self.train_batch_size = train_batch_size
        self.eval_batch_size = eval_batch_size

        self.text_fields = self.task_text_field_map[task_name]
        self.num_labels = self.glue_task_num_labels[task_name]
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)

    def setup(self, stage: str):
        self.dataset = datasets.load_dataset("glue", self.task_name)

        for split in self.dataset.keys():
            self.dataset[split] = self.dataset[split].map(
                self.convert_to_features,
                batched=True,
                remove_columns=["label"],
            )
            self.columns = [c for c in self.dataset[split].column_names if c in self.loader_columns]
            self.dataset[split].set_format(type="torch", columns=self.columns)

        self.eval_splits = [x for x in self.dataset.keys() if "validation" in x]

    def prepare_data(self):
        datasets.load_dataset("glue", self.task_name)
        AutoTokenizer.from_pretrained(self.model_name_or_path, use_fast=True)

    def train_dataloader(self):
        return DataLoader(self.dataset["train"], batch_size=self.train_batch_size, shuffle=True)

    def val_dataloader(self):
        if len(self.eval_splits) == 1:
            return DataLoader(self.dataset["validation"], batch_size=self.eval_batch_size)
        elif len(self.eval_splits) > 1:
            return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]

    def test_dataloader(self):
        if len(self.eval_splits) == 1:
            return DataLoader(self.dataset["test"], batch_size=self.eval_batch_size)
        elif len(self.eval_splits) > 1:
            return [DataLoader(self.dataset[x], batch_size=self.eval_batch_size) for x in self.eval_splits]

    def convert_to_features(self, example_batch, indices=None):
        # Either encode single sentence or sentence pairs
        if len(self.text_fields) > 1:
            texts_or_text_pairs = list(zip(example_batch[self.text_fields[0]], example_batch[self.text_fields[1]]))
        else:
            texts_or_text_pairs = example_batch[self.text_fields[0]]

        # Tokenize the text/text pairs
        features = self.tokenizer.batch_encode_plus(
            texts_or_text_pairs, max_length=self.max_seq_length, pad_to_max_length=True, truncation=True
        )

        # Rename label to labels to make it easier to pass to model forward
        features["labels"] = example_batch["label"]

        return features

In [4]:
class GLUETransformer(LightningModule):
    def __init__(
        self,
        model_name_or_path: str,
        num_labels: int,
        task_name: str,
        learning_rate: float = 2e-5,
        adam_epsilon: float = 1e-8,
        warmup_steps: int = 0,
        weight_decay: float = 0.0,
        train_batch_size: int = 32,
        eval_batch_size: int = 32,
        eval_splits: Optional[list] = None,
        **kwargs,
    ):
        super().__init__()

        self.save_hyperparameters()

        self.config = AutoConfig.from_pretrained(model_name_or_path, num_labels=num_labels)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path, config=self.config)
        self.metric = datasets.load_metric(
            "glue", self.hparams.task_name, experiment_id=datetime.now().strftime("%d-%m-%Y_%H-%M-%S")
        )

        self.training_step_outputs = []
        self.val_step_outputs = []

        self.wandb_name = "learning_rate_1e-5"
        self.epoch = 0

        # wandb.init(
        #     # Set the project where this run will be logged
        #     project="MLOPS",
        #     # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10)
        #     name=f"{self.wandb_name}_epoch_{self.epoch}",
        #     reinit=True,
        #     # Track hyperparameters and run metadata
        #     config={
        #         "architecture": "DistilBERT",
        #         "dataset": "MRPC",
        #         "learning_rate": self.hparams.learning_rate,
        #         "adam_epsilon": self.hparams.adam_epsilon,
        #         "warmup_steps": self.hparams.warmup_steps,
        #         "weight_decay": self.hparams.weight_decay,
        #         "train_batch_size": self.hparams.train_batch_size,
        #         "eval_batch_size": self.hparams.eval_batch_size
        #     })

        # wandb.log({
        #     "learning_rate": self.hparams.learning_rate,
        #     "adam_epsilon": self.hparams.adam_epsilon,
        #     "warmup_steps": self.hparams.warmup_steps,
        #     "weight_decay": self.hparams.weight_decay,
        #     "train_batch_size": self.hparams.train_batch_size,
        #     "eval_batch_size": self.hparams.eval_batch_size
        # })

    def forward(self, **inputs):
        return self.model(**inputs)

    def training_step(self, batch, batch_idx):
        if self.current_epoch != self.epoch:
            self.epoch = self.current_epoch
            # wandb.init(
            #     project="MLOPS",
            #     name=f"{self.wandb_name}_epoch_{self.epoch}",
            #     config=wandb.config
            # )

            # wandb.log({
            #     "learning_rate": self.hparams.learning_rate,
            #     "adam_epsilon": self.hparams.adam_epsilon,
            #     "warmup_steps": self.hparams.warmup_steps,
            #     "weight_decay": self.hparams.weight_decay,
            #     "train_batch_size": self.hparams.train_batch_size,
            #     "eval_batch_size": self.hparams.eval_batch_size
            # })

        outputs = self(**batch)
        loss = outputs[0]

        metrics = {
            "train/train_loss": loss
        }
        # wandb.log(metrics)

        self.training_step_outputs.append(loss)
        self.log("train/train_loss", loss)

        return loss

    def on_train_epoch_end(self):
        loss = torch.stack(self.training_step_outputs).mean()

        metrics = {
            "train/average_train_loss": loss
        }
        # wandb.log(metrics)
        self.log("train/train_average_loss", loss)

        self.training_step_outputs.clear()

    def validation_step(self, batch, batch_idx, dataloader_idx=0):
        outputs = self(**batch)
        val_loss, logits = outputs[:2]

        if self.hparams.num_labels > 1:
            preds = torch.argmax(logits, axis=1)
        elif self.hparams.num_labels == 1:
            preds = logits.squeeze()

        labels = batch["labels"]

        metrics = {
            "val/val_loss": val_loss
        }
        # wandb.log(metrics)

        self.val_step_outputs.append(val_loss)
        self.log("val/val_loss", val_loss)

        return {"loss": val_loss, "preds": preds, "labels": labels}

    def on_validation_epoch_end(self):
        loss = torch.stack(self.val_step_outputs).mean()

        metrics = {
            "val/average_val_loss": loss
        }
        # wandb.log(metrics)
        self.log("val/val_average_loss", loss)

        self.val_step_outputs.clear()

    def validation_epoch_end(self, outputs):
        if self.hparams.task_name == "mnli":
            for i, output in enumerate(outputs):
                # matched or mismatched
                split = self.hparams.eval_splits[i].split("_")[-1]
                preds = torch.cat([x["preds"] for x in output]).detach().cpu().numpy()
                labels = torch.cat([x["labels"] for x in output]).detach().cpu().numpy()
                loss = torch.stack([x["loss"] for x in output]).mean()
                self.log(f"val_loss_{split}", loss, prog_bar=True)
                split_metrics = {
                    f"{k}_{split}": v for k, v in self.metric.compute(predictions=preds, references=labels).items()
                }
                self.log_dict(split_metrics, prog_bar=True)
            return loss

        preds = torch.cat([x["preds"] for x in outputs]).detach().cpu().numpy()
        labels = torch.cat([x["labels"] for x in outputs]).detach().cpu().numpy()
        loss = torch.stack([x["loss"] for x in outputs]).mean()

        metrics = {
            "val/average_val_loss": loss
        }
        # wandb.log(metrics)

        self.log("val_loss", loss, prog_bar=True)
        self.log_dict(self.metric.compute(predictions=preds, references=labels), prog_bar=True)

    def configure_optimizers(self):
        """Prepare optimizer and schedule (linear warmup and decay)"""
        model = self.model
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]

        optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        # optimizer = torch.optim.SGD(optimizer_grouped_parameters, lr=self.hparams.learning_rate, momentum=1)
        # optimizer = torch.optim.ASGD(optimizer_grouped_parameters, lr=self.hparams.learning_rate, lambd=0.000001)

        # scheduler = get_inverse_sqrt_schedule(
        # scheduler = get_cosine_schedule_with_warmup(
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.hparams.warmup_steps,
            num_training_steps=self.trainer.estimated_stepping_batches,
            # timescale=1
        )
        scheduler = {"scheduler": scheduler, "interval": "step", "frequency": 1}
        return [optimizer], [scheduler]


In [5]:
seed_everything(42)

dm = GLUEDataModule(
    model_name_or_path="distilbert-base-uncased",
    task_name="mrpc",
    # train_batch_size=64,
    # eval_batch_size=64
)
dm.setup("fit")
model = GLUETransformer(
    model_name_or_path="distilbert-base-uncased",
    num_labels=dm.num_labels,
    eval_splits=dm.eval_splits,
    task_name=dm.task_name,
    # learning_rate=1e-5,
    # warmup_steps=75,
    # train_batch_size=64,
    # eval_batch_size=64,
    # weight_decay=1e-5,
    # adam_epsilon=1e-10,
)

trainer = Trainer(
    max_epochs=3,
    accelerator="auto",
    devices=1 if torch.cuda.is_available() else None,
)
# trainer.fit(model, datamodule=dm)

# wandb.finish()

INFO:lightning_fabric.utilities.seed:Global seed set to 42


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/28.8k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/28.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/27.9k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]



Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.metric = datasets.load_metric(


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [6]:
def train_func(config):
    dm2 = GLUEDataModule(
        model_name_or_path="distilbert-base-uncased",
        task_name="mrpc",
        train_batch_size=config["batch_size"],
        eval_batch_size=config["batch_size"]
    )
    dm2.setup("fit")
    model = GLUETransformer(
        model_name_or_path="distilbert-base-uncased",
        num_labels=dm2.num_labels,
        task_name=dm2.task_name,
        learning_rate=config["learning_rate"],
        warmup_steps=config["warmup_steps"],
        train_batch_size=config["batch_size"],
        eval_batch_size=config["batch_size"],
        eval_splits=dm2.eval_splits,
    )

    trainer = pl.Trainer(
        devices="auto",
        accelerator="auto",
        strategy=RayDDPStrategy(),
        callbacks=[RayTrainReportCallback()],
        plugins=[RayLightningEnvironment()],
        enable_progress_bar=False,
    )
    trainer = prepare_trainer(trainer)
    trainer.fit(model, datamodule=dm2)

In [8]:
num_epochs = 3
num_samples = 10

search_space = {
    "learning_rate": tune.loguniform(1e-6, 1e-1),
    "batch_size": tune.choice([4, 8, 16]),
    "warmup_steps": tune.randint(0, 251)
}

scaling_config = ScalingConfig(
    num_workers=1, use_gpu=True, resources_per_worker={"CPU": 1, "GPU": 1}
)

run_config = RunConfig(
    checkpoint_config=CheckpointConfig(
        num_to_keep=2,
        checkpoint_score_attribute="val_loss",
        checkpoint_score_order="min",
    ),
)

ray_trainer = TorchTrainer(
    train_func,
    run_config=run_config,
    scaling_config=scaling_config,
)

In [9]:
def tune_mnist_asha(num_samples=10):
    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    tuner = tune.Tuner(
        ray_trainer,
        param_space={"train_loop_config": search_space},
        tune_config=tune.TuneConfig(
            metric="val_loss",
            mode="min",
            num_samples=num_samples,
            scheduler=scheduler,
        ),
    )
    # RAY_memory_monitor_refresh_ms=0
    return tuner.fit()

results = tune_mnist_asha(num_samples=num_samples)

2023-10-17 11:42:37,837	INFO worker.py:1642 -- Started a local Ray instance.
2023-10-17 11:42:40,978	INFO tune.py:228 -- Initializing Ray automatically. For cluster usage or custom Ray initialization, call `ray.init(...)` before `Tuner(...)`.
2023-10-17 11:42:40,990	INFO tune.py:654 -- [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


+---------------------------------------------------------------------+
| Configuration for experiment     TorchTrainer_2023-10-17_11-42-32   |
+---------------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator              |
| Scheduler                        AsyncHyperBandScheduler            |
| Number of trials                 25                                 |
+---------------------------------------------------------------------+

View detailed results here: /root/ray_results/TorchTrainer_2023-10-17_11-42-32
To visualize your results with TensorBoard, run: `tensorboard --logdir /root/ray_results/TorchTrainer_2023-10-17_11-42-32`

Trial status: 16 PENDING
Current time: 2023-10-17 11:42:41. Total running time: 0s
Logical resource usage: 0/2 CPUs, 0/1 GPUs (0.0/1.0 accelerator_type:None)
+----------------------------------------------------------------------------------------------------------------+
| Trial name       

[2m[36m(TorchTrainer pid=1684)[0m Starting distributed worker processes: ['1807 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=1807)[0m Setting up process group for: env:// [rank=0, world_size=1]
Map:  27%|██▋       | 1000/3668 [00:00<00:01, 2465.90 examples/s]
Map:  55%|█████▍    | 2000/3668 [00:00<00:00, 2641.84 examples/s]
Map:  82%|████████▏ | 3000/3668 [00:01<00:00, 2559.83 examples/s]
Map: 100%|██████████| 3668/3668 [00:01<00:00, 2545.65 examples/s]
Map:   0%|          | 0/408 [00:00<?, ? examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 2441.99 examples/s]
Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 2502.67 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 2516.96 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 2461.57 examples/s]
[2m[36m(RayTrainWorker pid=1807)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and ar


Trial status: 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:43:11. Total running time: 30s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
+----------------------------------------------------------------------------------------------------------------+
| Trial name                 status       ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps |
+----------------------------------------------------------------------------------------------------------------+
| TorchTrainer_4753d_00000   RUNNING               7.45934e-05                        4                       14 |
| TorchTrainer_4753d_00001   PENDING               0.000984674                       16                      121 |
| TorchTrainer_4753d_00002   PENDING               1.95172e-06                        4                       99 |
| TorchTrainer_4753d_00003   PENDING               0.00179656                         4                        1 |
| TorchTrainer_4753

[2m[36m(RayTrainWorker pid=1807)[0m [rank: 0] Global seed set to 42
[2m[36m(RayTrainWorker pid=1807)[0m Missing logger folder: /root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00000_0_batch_size=4,learning_rate=0.0001,warmup_steps=14_2023-10-17_11-42-41/lightning_logs
Map:  27%|██▋       | 1000/3668 [00:00<00:00, 4852.25 examples/s]
Map:  55%|█████▍    | 2000/3668 [00:00<00:00, 4890.60 examples/s]
Map:  82%|████████▏ | 3000/3668 [00:00<00:00, 4916.50 examples/s]
Map: 100%|██████████| 3668/3668 [00:00<00:00, 4810.43 examples/s]
Map:   0%|          | 0/408 [00:00<?, ? examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 4379.39 examples/s]
Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 4848.78 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 4803.73 examples/s]
[2m[36m(RayTrainWorker pid=1807)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(RayTrainWorker pid=1807)[0m Loading `train_data

Trial status: 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:43:41. Total running time: 1min 0s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
+----------------------------------------------------------------------------------------------------------------+
| Trial name                 status       ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps |
+----------------------------------------------------------------------------------------------------------------+
| TorchTrainer_4753d_00000   RUNNING               7.45934e-05                        4                       14 |
| TorchTrainer_4753d_00001   PENDING               0.000984674                       16                      121 |
| TorchTrainer_4753d_00002   PENDING               1.95172e-06                        4                       99 |
| TorchTrainer_4753d_00003   PENDING               0.00179656                         4                        1 |
| TorchTrainer_4

[2m[36m(RayTrainWorker pid=1807)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00000_0_batch_size=4,learning_rate=0.0001,warmup_steps=14_2023-10-17_11-42-41/checkpoint_000000)


Trial status: 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:45:12. Total running time: 2min 30s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00000 with val_loss=0.5556000471115112 and params={'train_loop_config': {'learning_rate': 7.45934328572655e-05, 'batch_size': 4, 'warmup_steps': 14}}
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status       ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| TorchTrainer_4753d_00

[2m[36m(RayTrainWorker pid=1807)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00000_0_batch_size=4,learning_rate=0.0001,warmup_steps=14_2023-10-17_11-42-41/checkpoint_000001)


Trial status: 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:46:12. Total running time: 3min 30s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00000 with val_loss=0.5041380524635315 and params={'train_loop_config': {'learning_rate': 7.45934328572655e-05, 'batch_size': 4, 'warmup_steps': 14}}
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status       ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| TorchTrainer_4753d_00

[2m[36m(RayTrainWorker pid=1807)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00000_0_batch_size=4,learning_rate=0.0001,warmup_steps=14_2023-10-17_11-42-41/checkpoint_000002)



Trial TorchTrainer_4753d_00000 completed after 3 iterations at 2023-10-17 11:47:26. Total running time: 4min 45s
+-------------------------------------------------------------+
| Trial TorchTrainer_4753d_00000 result                       |
+-------------------------------------------------------------+
| checkpoint_dir_name                       checkpoint_000002 |
| time_this_iter_s                                   74.86369 |
| time_total_s                                      268.27617 |
| training_iteration                                        3 |
| accuracy                                            0.72794 |
| epoch                                                     2 |
| f1                                                  0.79252 |
| step                                                   2751 |
| train/train_average_loss                            0.43009 |
| train/train_loss                                    0.18407 |
| val/val_average_loss                                

[2m[36m(TorchTrainer pid=2994)[0m Starting distributed worker processes: ['3058 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=3058)[0m Setting up process group for: env:// [rank=0, world_size=1]



Trial status: 1 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:47:42. Total running time: 5min 1s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00000 with val_loss=0.5413188934326172 and params={'train_loop_config': {'learning_rate': 7.45934328572655e-05, 'batch_size': 4, 'warmup_steps': 14}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 

Map:  27%|██▋       | 1000/3668 [00:00<00:01, 2172.56 examples/s]
Map:  55%|█████▍    | 2000/3668 [00:00<00:00, 2176.97 examples/s]
Map:  82%|████████▏ | 3000/3668 [00:01<00:00, 2345.48 examples/s]
Map: 100%|██████████| 3668/3668 [00:01<00:00, 2308.59 examples/s]
Map:   0%|          | 0/408 [00:00<?, ? examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 2443.66 examples/s]
Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 2479.74 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 2445.53 examples/s]
[2m[36m(RayTrainWorker pid=3058)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
[2m[36m(RayTrainWorker pid=3058)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m

Trial status: 1 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:48:12. Total running time: 5min 31s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00000 with val_loss=0.5413188934326172 and params={'train_loop_config': {'learning_rate': 7.45934328572655e-05, 'batch_size': 4, 'warmup_steps': 14}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 

[2m[36m(RayTrainWorker pid=3058)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00001_1_batch_size=16,learning_rate=0.0010,warmup_steps=121_2023-10-17_11-42-41/checkpoint_000000)



Trial TorchTrainer_4753d_00002 started with configuration:
+-------------------------------------------------------+
| Trial TorchTrainer_4753d_00002 config                 |
+-------------------------------------------------------+
| train_loop_config/batch_size                        4 |
| train_loop_config/learning_rate           1.95172e-06 |
| train_loop_config/warmup_steps                     99 |
+-------------------------------------------------------+


[2m[36m(TorchTrainer pid=3455)[0m Starting distributed worker processes: ['3519 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=3519)[0m Setting up process group for: env:// [rank=0, world_size=1]
Map: 100%|██████████| 408/408 [00:00<00:00, 4049.49 examples/s]
Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 4789.48 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 4630.21 examples/s]
[2m[36m(RayTrainWorker pid=3519)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
[2m[36m(RayTrainWorker pid=3519)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Trial status: 2 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:49:12. Total running time: 6min 31s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00000 with val_loss=0.5413188934326172 and params={'train_loop_config': {'learning_rate': 7.45934328572655e-05, 'batch_size': 4, 'warmup_steps': 14}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|

[2m[36m(RayTrainWorker pid=3519)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=3519)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=3519)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=3519)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=3519)[0m   rank_zero_warn(
[2m[36m(RayTrainWorker pid=3519)[0m [rank: 0] Global seed set to 42
[2m[36m(RayTrainWorker pid=3519)[0m Missing logger folder: /root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00002_2_batch_size=4,learning_rate=0.0000,warmup_steps=99_2023-10-17_11-42-41/lightning_logs
Map:   0%|          | 0/3668 [00:00<?, ? examples/s]
Map:  27%|██▋       | 1000/3668 [00:00<00:00, 2772.30 examples/s]
Map:  55%|█████▍    | 2000/3668 [00:00<00:00, 3322.85 examples/s]
Map:  82%|████████▏ | 3000/3668 [00:00<00:00, 3815.78 examples/s]
Map: 100%|██████████| 3668/3668 [00:00<00:00, 3675.99 examples/s]
Map:   0%|          | 

Trial status: 2 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:49:42. Total running time: 7min 1s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00000 with val_loss=0.5413188934326172 and params={'train_loop_config': {'learning_rate': 7.45934328572655e-05, 'batch_size': 4, 'warmup_steps': 14}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| T

[2m[36m(RayTrainWorker pid=3519)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00002_2_batch_size=4,learning_rate=0.0000,warmup_steps=99_2023-10-17_11-42-41/checkpoint_000000)


Trial status: 2 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:51:12. Total running time: 8min 31s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00000 with val_loss=0.5413188934326172 and params={'train_loop_config': {'learning_rate': 7.45934328572655e-05, 'batch_size': 4, 'warmup_steps': 14}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 

[2m[36m(RayTrainWorker pid=3519)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00002_2_batch_size=4,learning_rate=0.0000,warmup_steps=99_2023-10-17_11-42-41/checkpoint_000001)


Trial status: 2 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:52:12. Total running time: 9min 31s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.46388867497444153 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

[2m[36m(RayTrainWorker pid=3519)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00002_2_batch_size=4,learning_rate=0.0000,warmup_steps=99_2023-10-17_11-42-41/checkpoint_000002)



Trial TorchTrainer_4753d_00002 completed after 3 iterations at 2023-10-17 11:53:33. Total running time: 10min 51s
+-------------------------------------------------------------+
| Trial TorchTrainer_4753d_00002 result                       |
+-------------------------------------------------------------+
| checkpoint_dir_name                       checkpoint_000002 |
| time_this_iter_s                                   89.27856 |
| time_total_s                                      268.94711 |
| training_iteration                                        3 |
| accuracy                                            0.82843 |
| epoch                                                     2 |
| f1                                                  0.88255 |
| step                                                   2751 |
| train/train_average_loss                            0.52208 |
| train/train_loss                                    0.52633 |
| val/val_average_loss                               

[2m[36m(TorchTrainer pid=4718)[0m Starting distributed worker processes: ['4786 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=4786)[0m Setting up process group for: env:// [rank=0, world_size=1]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 4407.73 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 4316.20 examples/s]
[2m[36m(RayTrainWorker pid=4786)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
[2m[36m(RayTrainWorker pid=4786)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=4786)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=4786)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=4786)[0m IPU available: False, using: 0 IPUs
[2m[


Trial status: 3 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:54:13. Total running time: 11min 31s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=4786)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00003_3_batch_size=4,learning_rate=0.0018,warmup_steps=1_2023-10-17_11-42-41/checkpoint_000000)



Trial TorchTrainer_4753d_00004 started with configuration:
+-----------------------------------------------------+
| Trial TorchTrainer_4753d_00004 config               |
+-----------------------------------------------------+
| train_loop_config/batch_size                      8 |
| train_loop_config/learning_rate           0.0492905 |
| train_loop_config/warmup_steps                  191 |
+-----------------------------------------------------+


[2m[36m(TorchTrainer pid=5245)[0m Starting distributed worker processes: ['5301 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=5301)[0m Setting up process group for: env:// [rank=0, world_size=1]
Map:  27%|██▋       | 1000/3668 [00:00<00:01, 2333.48 examples/s]
Map:  55%|█████▍    | 2000/3668 [00:00<00:00, 2743.73 examples/s]
Map:  82%|████████▏ | 3000/3668 [00:01<00:00, 2724.92 examples/s]
Map: 100%|██████████| 3668/3668 [00:01<00:00, 2663.44 examples/s]
Map:   0%|          | 0/408 [00:00<?, ? examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 2289.44 examples/s]
Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 3101.21 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 2776.36 examples/s]
[2m[36m(RayTrainWorker pid=5301)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'c


Trial status: 4 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:55:43. Total running time: 13min 2s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

Map:  27%|██▋       | 1000/3668 [00:00<00:00, 4758.63 examples/s]
Map:  55%|█████▍    | 2000/3668 [00:00<00:00, 4727.95 examples/s]
Map:  82%|████████▏ | 3000/3668 [00:00<00:00, 4759.28 examples/s]
Map: 100%|██████████| 3668/3668 [00:00<00:00, 4679.28 examples/s]
Map:   0%|          | 0/408 [00:00<?, ? examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 4545.70 examples/s]
Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 5219.08 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 4891.08 examples/s]
[2m[36m(RayTrainWorker pid=5301)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(RayTrainWorker pid=5301)[0m Loading `train_dataloader` to estimate number of stepping batches.
[2m[36m(RayTrainWorker pid=5301)[0m 
[2m[36m(RayTrainWorker pid=5301)[0m   | Name  | Type                                | Params
[2m[36m(RayTrainWorker pid=5301)[0m --------------------------------------------------------------
[2m[36m(

Trial status: 4 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:56:13. Total running time: 13min 32s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

[2m[36m(RayTrainWorker pid=5301)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00004_4_batch_size=8,learning_rate=0.0493,warmup_steps=191_2023-10-17_11-42-41/checkpoint_000000)



Trial TorchTrainer_4753d_00005 started with configuration:
+------------------------------------------------------+
| Trial TorchTrainer_4753d_00005 config                |
+------------------------------------------------------+
| train_loop_config/batch_size                       8 |
| train_loop_config/learning_rate           0.00122295 |
| train_loop_config/warmup_steps                    21 |
+------------------------------------------------------+


[2m[36m(TorchTrainer pid=5720)[0m Starting distributed worker processes: ['5780 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=5780)[0m Setting up process group for: env:// [rank=0, world_size=1]
Map: 100%|██████████| 408/408 [00:00<00:00, 2026.13 examples/s]
Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 2652.22 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 2565.34 examples/s]
[2m[36m(RayTrainWorker pid=5780)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
[2m[36m(RayTrainWorker pid=5780)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=5780)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=5780)[0m TPU a


Trial status: 5 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:57:13. Total running time: 14min 32s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Map:  27%|██▋       | 1000/3668 [00:00<00:00, 4853.83 examples/s]
Map:  55%|█████▍    | 2000/3668 [00:00<00:00, 4796.33 examples/s]
Map:  82%|████████▏ | 3000/3668 [00:00<00:00, 4841.01 examples/s]
Map: 100%|██████████| 3668/3668 [00:00<00:00, 4729.86 examples/s]
Map:   0%|          | 0/408 [00:00<?, ? examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 4612.78 examples/s]
Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 2436.89 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 2364.34 examples/s]
[2m[36m(RayTrainWorker pid=5780)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(RayTrainWorker pid=5780)[0m Loading `train_dataloader` to estimate number of stepping batches.
[2m[36m(RayTrainWorker pid=5780)[0m 
[2m[36m(RayTrainWorker pid=5780)[0m   | Name  | Type                                | Params
[2m[36m(RayTrainWorker pid=5780)[0m --------------------------------------------------------------
[2m[36m(

Trial status: 5 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:57:43. Total running time: 15min 2s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


[2m[36m(RayTrainWorker pid=5780)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00005_5_batch_size=8,learning_rate=0.0012,warmup_steps=21_2023-10-17_11-42-41/checkpoint_000000)



Trial TorchTrainer_4753d_00005 completed after 1 iterations at 2023-10-17 11:58:22. Total running time: 15min 40s
+-------------------------------------------------------------+
| Trial TorchTrainer_4753d_00005 result                       |
+-------------------------------------------------------------+
| checkpoint_dir_name                       checkpoint_000000 |
| time_this_iter_s                                    84.0109 |
| time_total_s                                        84.0109 |
| training_iteration                                        1 |
| accuracy                                            0.68382 |
| epoch                                                     0 |
| f1                                                  0.81223 |
| step                                                    459 |
| train/train_loss                                    0.29225 |
| val/val_average_loss                                0.64492 |
| val/val_loss                                       

[2m[36m(TorchTrainer pid=6191)[0m Starting distributed worker processes: ['6247 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=6247)[0m Setting up process group for: env:// [rank=0, world_size=1]
Map: 100%|██████████| 408/408 [00:00<00:00, 2214.73 examples/s]
Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 2320.40 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 2480.25 examples/s]
[2m[36m(RayTrainWorker pid=6247)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
[2m[36m(RayTrainWorker pid=6247)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=6247)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=6247)[0m TPU a


Trial status: 6 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:58:43. Total running time: 16min 2s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

Map:  27%|██▋       | 1000/3668 [00:00<00:00, 3583.25 examples/s]
Map:  55%|█████▍    | 2000/3668 [00:00<00:00, 4133.68 examples/s]
Map:  82%|████████▏ | 3000/3668 [00:00<00:00, 3996.77 examples/s]
Map: 100%|██████████| 3668/3668 [00:00<00:00, 4032.55 examples/s]
Map:   0%|          | 0/408 [00:00<?, ? examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 4748.40 examples/s]
Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 4790.75 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 4711.65 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 4669.83 examples/s]
[2m[36m(RayTrainWorker pid=6247)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(RayTrainWorker pid=6247)[0m Loading `train_dataloader` to estimate number of stepping batches.
[2m[36m(RayTrainWorker pid=6247)[0m 
[2m[36m(RayTrainWorker pid=6247)[0m   | Name  | Type                                | Params
[2m[36m(RayTrainWorker pid=6247)[0m -------

Trial status: 6 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 11:59:13. Total running time: 16min 32s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

[2m[36m(RayTrainWorker pid=6247)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00006_6_batch_size=16,learning_rate=0.0000,warmup_steps=58_2023-10-17_11-42-41/checkpoint_000000)


Trial status: 6 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 12:00:14. Total running time: 17min 32s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

[2m[36m(RayTrainWorker pid=6247)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00006_6_batch_size=16,learning_rate=0.0000,warmup_steps=58_2023-10-17_11-42-41/checkpoint_000001)



Trial status: 7 TERMINATED | 16 PENDING
Current time: 2023-10-17 12:00:44. Total running time: 18min 2s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| TorchTrai

[2m[36m(TorchTrainer pid=6866)[0m Starting distributed worker processes: ['6934 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=6934)[0m Setting up process group for: env:// [rank=0, world_size=1]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 4595.65 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 4645.15 examples/s]
[2m[36m(RayTrainWorker pid=6934)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
[2m[36m(RayTrainWorker pid=6934)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=6934)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=6934)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=6934)[0m IPU available: False, using: 0 IPUs
[2m[


Trial status: 7 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 12:01:14. Total running time: 18min 32s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------



Trial status: 7 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 12:01:44. Total running time: 19min 2s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


[2m[36m(RayTrainWorker pid=6934)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00007_7_batch_size=16,learning_rate=0.0000,warmup_steps=189_2023-10-17_11-42-41/checkpoint_000000)


Trial status: 7 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 12:02:14. Total running time: 19min 32s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+



Trial status: 7 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 12:02:44. Total running time: 20min 2s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


[2m[36m(RayTrainWorker pid=6934)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00007_7_batch_size=16,learning_rate=0.0000,warmup_steps=189_2023-10-17_11-42-41/checkpoint_000001)



Trial status: 8 TERMINATED | 16 PENDING
Current time: 2023-10-17 12:03:14. Total running time: 20min 33s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| TorchTra

[2m[36m(TorchTrainer pid=7571)[0m Starting distributed worker processes: ['7625 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=7625)[0m Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=7625)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias']
[2m[36m(RayTrainWorker pid=7625)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=7625)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=7625)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=7625)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=7625)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=7625)[0m   rank_zero_warn(
[2m[


Trial status: 8 TERMINATED | 1 RUNNING | 16 PENDING
Current time: 2023-10-17 12:03:44. Total running time: 21min 3s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

[2m[36m(RayTrainWorker pid=7625)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00008_8_batch_size=16,learning_rate=0.0012,warmup_steps=243_2023-10-17_11-42-41/checkpoint_000000)



Trial TorchTrainer_4753d_00009 started with configuration:
+-------------------------------------------------------+
| Trial TorchTrainer_4753d_00009 config                 |
+-------------------------------------------------------+
| train_loop_config/batch_size                       16 |
| train_loop_config/learning_rate           1.70707e-06 |
| train_loop_config/warmup_steps                    134 |
+-------------------------------------------------------+

Trial status: 9 TERMINATED | 1 RUNNING | 15 PENDING
Current time: 2023-10-17 12:04:44. Total running time: 22min 3s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+---------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(TorchTrainer pid=8024)[0m Starting distributed worker processes: ['8080 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=8080)[0m Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=8080)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
[2m[36m(RayTrainWorker pid=8080)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=8080)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=8080)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=8080)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=8080)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=8080)[0m   rank_zero_warn(
[2m[

Trial status: 9 TERMINATED | 1 RUNNING | 15 PENDING
Current time: 2023-10-17 12:05:14. Total running time: 22min 33s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

[2m[36m(RayTrainWorker pid=8080)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00009_9_batch_size=16,learning_rate=0.0000,warmup_steps=134_2023-10-17_11-42-41/checkpoint_000000)


Trial status: 9 TERMINATED | 1 RUNNING | 15 PENDING
Current time: 2023-10-17 12:06:14. Total running time: 23min 33s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

[2m[36m(RayTrainWorker pid=8080)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00009_9_batch_size=16,learning_rate=0.0000,warmup_steps=134_2023-10-17_11-42-41/checkpoint_000001)



Trial TorchTrainer_4753d_00009 completed after 2 iterations at 2023-10-17 12:06:52. Total running time: 24min 10s
+-------------------------------------------------------------+
| Trial TorchTrainer_4753d_00009 result                       |
+-------------------------------------------------------------+
| checkpoint_dir_name                       checkpoint_000001 |
| time_this_iter_s                                   54.43804 |
| time_total_s                                      128.70653 |
| training_iteration                                        2 |
| accuracy                                            0.71078 |
| epoch                                                     1 |
| f1                                                  0.82388 |
| step                                                    460 |
| train/train_average_loss                            0.68397 |
| train/train_loss                                    0.38617 |
| val/val_average_loss                               

[2m[36m(TorchTrainer pid=8689)[0m Starting distributed worker processes: ['8745 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=8745)[0m Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=8745)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
[2m[36m(RayTrainWorker pid=8745)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=8745)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=8745)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=8745)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=8745)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=8745)[0m   rank_zero_warn(
[2m[


Trial status: 10 TERMINATED | 1 RUNNING | 14 PENDING
Current time: 2023-10-17 12:07:14. Total running time: 24min 33s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Map:  27%|██▋       | 1000/3668 [00:00<00:00, 4582.22 examples/s]
Map:  55%|█████▍    | 2000/3668 [00:00<00:00, 4433.00 examples/s]
Map:  82%|████████▏ | 3000/3668 [00:00<00:00, 3378.96 examples/s]
Map: 100%|██████████| 3668/3668 [00:01<00:00, 3255.05 examples/s]
Map:   0%|          | 0/408 [00:00<?, ? examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 2191.43 examples/s]
Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 2669.47 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 2527.97 examples/s]
[2m[36m(RayTrainWorker pid=8745)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(RayTrainWorker pid=8745)[0m Loading `train_dataloader` to estimate number of stepping batches.
[2m[36m(RayTrainWorker pid=8745)[0m 
[2m[36m(RayTrainWorker pid=8745)[0m   | Name  | Type                                | Params
[2m[36m(RayTrainWorker pid=8745)[0m --------------------------------------------------------------
[2m[36m(

Trial status: 10 TERMINATED | 1 RUNNING | 14 PENDING
Current time: 2023-10-17 12:07:45. Total running time: 25min 3s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

[2m[36m(RayTrainWorker pid=8745)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00010_10_batch_size=4,learning_rate=0.0000,warmup_steps=59_2023-10-17_11-42-41/checkpoint_000000)


Trial status: 10 TERMINATED | 1 RUNNING | 14 PENDING
Current time: 2023-10-17 12:08:45. Total running time: 26min 3s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

[2m[36m(RayTrainWorker pid=8745)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00010_10_batch_size=4,learning_rate=0.0000,warmup_steps=59_2023-10-17_11-42-41/checkpoint_000001)


Trial status: 10 TERMINATED | 1 RUNNING | 14 PENDING
Current time: 2023-10-17 12:10:15. Total running time: 27min 33s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=8745)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00010_10_batch_size=4,learning_rate=0.0000,warmup_steps=59_2023-10-17_11-42-41/checkpoint_000002)



Trial status: 11 TERMINATED | 14 PENDING
Current time: 2023-10-17 12:11:15. Total running time: 28min 34s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| TorchTr

[2m[36m(TorchTrainer pid=9876)[0m Starting distributed worker processes: ['9934 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=9934)[0m Setting up process group for: env:// [rank=0, world_size=1]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 4345.79 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 4495.06 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 4414.57 examples/s]
[2m[36m(RayTrainWorker pid=9934)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
[2m[36m(RayTrainWorker pid=9934)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=9934)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=9934)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTr


Trial status: 11 TERMINATED | 1 RUNNING | 13 PENDING
Current time: 2023-10-17 12:11:45. Total running time: 29min 4s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------



Trial status: 11 TERMINATED | 1 RUNNING | 13 PENDING
Current time: 2023-10-17 12:12:15. Total running time: 29min 34s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=9934)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00011_11_batch_size=8,learning_rate=0.0001,warmup_steps=52_2023-10-17_11-42-41/checkpoint_000000)


Trial status: 11 TERMINATED | 1 RUNNING | 13 PENDING
Current time: 2023-10-17 12:13:15. Total running time: 30min 34s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=9934)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00011_11_batch_size=8,learning_rate=0.0001,warmup_steps=52_2023-10-17_11-42-41/checkpoint_000001)


Trial status: 11 TERMINATED | 1 RUNNING | 13 PENDING
Current time: 2023-10-17 12:14:15. Total running time: 31min 34s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=9934)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00011_11_batch_size=8,learning_rate=0.0001,warmup_steps=52_2023-10-17_11-42-41/checkpoint_000002)



Trial TorchTrainer_4753d_00012 started with configuration:
+-------------------------------------------------------+
| Trial TorchTrainer_4753d_00012 config                 |
+-------------------------------------------------------+
| train_loop_config/batch_size                       16 |
| train_loop_config/learning_rate           1.60372e-05 |
| train_loop_config/warmup_steps                    171 |
+-------------------------------------------------------+


[2m[36m(TorchTrainer pid=10913)[0m Starting distributed worker processes: ['10973 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=10973)[0m Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=10973)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier.weight', 'pre_classifier.bias']
[2m[36m(RayTrainWorker pid=10973)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=10973)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=10973)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=10973)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=10973)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=10973)[0m   rank_zero_wa


Trial status: 12 TERMINATED | 1 RUNNING | 12 PENDING
Current time: 2023-10-17 12:15:15. Total running time: 32min 34s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=10973)[0m [rank: 0] Global seed set to 42
[2m[36m(RayTrainWorker pid=10973)[0m Missing logger folder: /root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00012_12_batch_size=16,learning_rate=0.0000,warmup_steps=171_2023-10-17_11-42-41/lightning_logs
Map:  27%|██▋       | 1000/3668 [00:00<00:01, 2205.07 examples/s]
Map:  55%|█████▍    | 2000/3668 [00:00<00:00, 2555.00 examples/s]
Map:  82%|████████▏ | 3000/3668 [00:01<00:00, 2518.00 examples/s]
Map: 100%|██████████| 3668/3668 [00:01<00:00, 2473.49 examples/s]
Map:   0%|          | 0/408 [00:00<?, ? examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 2342.07 examples/s]
Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 4689.69 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 4521.94 examples/s]
[2m[36m(RayTrainWorker pid=10973)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(RayTrainWorker pid=10973)[0m Loading `tra

Trial status: 12 TERMINATED | 1 RUNNING | 12 PENDING
Current time: 2023-10-17 12:15:46. Total running time: 33min 4s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

[2m[36m(RayTrainWorker pid=10973)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00012_12_batch_size=16,learning_rate=0.0000,warmup_steps=171_2023-10-17_11-42-41/checkpoint_000000)


Trial status: 12 TERMINATED | 1 RUNNING | 12 PENDING
Current time: 2023-10-17 12:16:46. Total running time: 34min 4s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00002 with val_loss=0.4060378968715668 and params={'train_loop_config': {'learning_rate': 1.9517224641449515e-06, 'batch_size': 4, 'warmup_steps': 99}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

[2m[36m(RayTrainWorker pid=10973)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00012_12_batch_size=16,learning_rate=0.0000,warmup_steps=171_2023-10-17_11-42-41/checkpoint_000001)


Trial status: 12 TERMINATED | 1 RUNNING | 12 PENDING
Current time: 2023-10-17 12:17:16. Total running time: 34min 34s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.35121408104896545 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=10973)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00012_12_batch_size=16,learning_rate=0.0000,warmup_steps=171_2023-10-17_11-42-41/checkpoint_000002)



Trial TorchTrainer_4753d_00012 completed after 3 iterations at 2023-10-17 12:18:10. Total running time: 35min 29s
+-------------------------------------------------------------+
| Trial TorchTrainer_4753d_00012 result                       |
+-------------------------------------------------------------+
| checkpoint_dir_name                       checkpoint_000002 |
| time_this_iter_s                                   54.61322 |
| time_total_s                                       185.1326 |
| training_iteration                                        3 |
| accuracy                                            0.85294 |
| epoch                                                     2 |
| f1                                                  0.89691 |
| step                                                    690 |
| train/train_average_loss                            0.41602 |
| train/train_loss                                     0.2791 |
| val/val_average_loss                               

[2m[36m(TorchTrainer pid=11814)[0m Starting distributed worker processes: ['11874 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=11874)[0m Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=11874)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
[2m[36m(RayTrainWorker pid=11874)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=11874)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=11874)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=11874)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=11874)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=11874)[0m   rank_zero_wa


Trial status: 13 TERMINATED | 1 RUNNING | 11 PENDING
Current time: 2023-10-17 12:18:46. Total running time: 36min 4s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=11874)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00013_13_batch_size=8,learning_rate=0.0000,warmup_steps=80_2023-10-17_11-42-41/checkpoint_000000)


Trial status: 13 TERMINATED | 1 RUNNING | 11 PENDING
Current time: 2023-10-17 12:19:46. Total running time: 37min 5s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=11874)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00013_13_batch_size=8,learning_rate=0.0000,warmup_steps=80_2023-10-17_11-42-41/checkpoint_000001)



Trial TorchTrainer_4753d_00013 completed after 2 iterations at 2023-10-17 12:20:43. Total running time: 38min 2s
+-------------------------------------------------------------+
| Trial TorchTrainer_4753d_00013 result                       |
+-------------------------------------------------------------+
| checkpoint_dir_name                       checkpoint_000001 |
| time_this_iter_s                                   64.47971 |
| time_total_s                                      144.83306 |
| training_iteration                                        2 |
| accuracy                                            0.70833 |
| epoch                                                     1 |
| f1                                                  0.82265 |
| step                                                    918 |
| train/train_average_loss                            0.63339 |
| train/train_loss                                    0.41197 |
| val/val_average_loss                                

[2m[36m(TorchTrainer pid=12549)[0m Starting distributed worker processes: ['12603 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=12603)[0m Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=12603)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
[2m[36m(RayTrainWorker pid=12603)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=12603)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=12603)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=12603)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=12603)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=12603)[0m   rank_zero_wa


Trial status: 14 TERMINATED | 1 RUNNING | 10 PENDING
Current time: 2023-10-17 12:21:16. Total running time: 38min 35s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------



Trial status: 14 TERMINATED | 1 RUNNING | 10 PENDING
Current time: 2023-10-17 12:21:46. Total running time: 39min 5s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=12603)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00014_14_batch_size=8,learning_rate=0.0021,warmup_steps=133_2023-10-17_11-42-41/checkpoint_000000)



Trial TorchTrainer_4753d_00015 started with configuration:
+-------------------------------------------------------+
| Trial TorchTrainer_4753d_00015 config                 |
+-------------------------------------------------------+
| train_loop_config/batch_size                        4 |
| train_loop_config/learning_rate           0.000689448 |
| train_loop_config/warmup_steps                    190 |
+-------------------------------------------------------+


[2m[36m(TorchTrainer pid=13072)[0m Starting distributed worker processes: ['13136 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=13136)[0m Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=13136)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
[2m[36m(RayTrainWorker pid=13136)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=13136)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=13136)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=13136)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=13136)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=13136)[0m   rank_zero_wa


Trial status: 15 TERMINATED | 1 RUNNING | 9 PENDING
Current time: 2023-10-17 12:22:46. Total running time: 40min 5s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=13136)[0m [rank: 0] Global seed set to 42
[2m[36m(RayTrainWorker pid=13136)[0m Missing logger folder: /root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00015_15_batch_size=4,learning_rate=0.0007,warmup_steps=190_2023-10-17_11-42-41/lightning_logs
Map:  27%|██▋       | 1000/3668 [00:00<00:01, 2256.82 examples/s]
Map:  55%|█████▍    | 2000/3668 [00:00<00:00, 2430.29 examples/s]
Map:  82%|████████▏ | 3000/3668 [00:01<00:00, 2311.52 examples/s]
Map: 100%|██████████| 3668/3668 [00:01<00:00, 2355.83 examples/s]
Map:   0%|          | 0/408 [00:00<?, ? examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 2573.46 examples/s]
Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 4873.13 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 4705.05 examples/s]
[2m[36m(RayTrainWorker pid=13136)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(RayTrainWorker pid=13136)[0m Loading `trai

Trial status: 15 TERMINATED | 1 RUNNING | 9 PENDING
Current time: 2023-10-17 12:23:16. Total running time: 40min 35s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=13136)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00015_15_batch_size=4,learning_rate=0.0007,warmup_steps=190_2023-10-17_11-42-41/checkpoint_000000)



Trial status: 16 TERMINATED | 9 PENDING
Current time: 2023-10-17 12:24:17. Total running time: 41min 35s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| TorchT

[2m[36m(TorchTrainer pid=13597)[0m Starting distributed worker processes: ['13651 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=13651)[0m Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=13651)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
[2m[36m(RayTrainWorker pid=13651)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=13651)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=13651)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=13651)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=13651)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=13651)[0m   rank_zero_wa


Trial status: 16 TERMINATED | 1 RUNNING | 8 PENDING
Current time: 2023-10-17 12:24:47. Total running time: 42min 5s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=13651)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00016_16_batch_size=8,learning_rate=0.0002,warmup_steps=189_2023-10-17_11-42-58/checkpoint_000000)



Trial status: 17 TERMINATED | 8 PENDING
Current time: 2023-10-17 12:25:47. Total running time: 43min 5s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| TorchTr

[2m[36m(TorchTrainer pid=14076)[0m Starting distributed worker processes: ['14130 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=14130)[0m Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=14130)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classifier.bias', 'classifier.weight']
[2m[36m(RayTrainWorker pid=14130)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=14130)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=14130)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=14130)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=14130)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=14130)[0m   rank_zero_wa


Trial status: 17 TERMINATED | 1 RUNNING | 7 PENDING
Current time: 2023-10-17 12:26:17. Total running time: 43min 35s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------



Trial status: 17 TERMINATED | 1 RUNNING | 7 PENDING
Current time: 2023-10-17 12:26:47. Total running time: 44min 5s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=14130)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00017_17_batch_size=16,learning_rate=0.0043,warmup_steps=199_2023-10-17_11-47-35/checkpoint_000000)



Trial status: 18 TERMINATED | 7 PENDING
Current time: 2023-10-17 12:27:17. Total running time: 44min 36s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| TorchT

[2m[36m(TorchTrainer pid=14521)[0m Starting distributed worker processes: ['14585 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=14585)[0m Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=14585)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
[2m[36m(RayTrainWorker pid=14585)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=14585)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=14585)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=14585)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=14585)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=14585)[0m   rank_zero_wa


Trial status: 18 TERMINATED | 1 RUNNING | 6 PENDING
Current time: 2023-10-17 12:27:47. Total running time: 45min 6s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=14585)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00018_18_batch_size=8,learning_rate=0.0004,warmup_steps=39_2023-10-17_11-49-04/checkpoint_000000)



Trial status: 19 TERMINATED | 6 PENDING
Current time: 2023-10-17 12:28:47. Total running time: 46min 6s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| TorchTr

[2m[36m(TorchTrainer pid=14984)[0m Starting distributed worker processes: ['15042 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=15042)[0m Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=15042)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
[2m[36m(RayTrainWorker pid=15042)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=15042)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=15042)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=15042)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=15042)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=15042)[0m   rank_zero_wa


Trial status: 19 TERMINATED | 1 RUNNING | 5 PENDING
Current time: 2023-10-17 12:29:17. Total running time: 46min 36s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=15042)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00019_19_batch_size=16,learning_rate=0.0055,warmup_steps=52_2023-10-17_11-53-43/checkpoint_000000)



Trial TorchTrainer_4753d_00020 started with configuration:
+-----------------------------------------------------+
| Trial TorchTrainer_4753d_00020 config               |
+-----------------------------------------------------+
| train_loop_config/batch_size                      4 |
| train_loop_config/learning_rate           0.0670311 |
| train_loop_config/warmup_steps                   40 |
+-----------------------------------------------------+


[2m[36m(TorchTrainer pid=15417)[0m Starting distributed worker processes: ['15475 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=15475)[0m Setting up process group for: env:// [rank=0, world_size=1]



Trial status: 20 TERMINATED | 1 RUNNING | 4 PENDING
Current time: 2023-10-17 12:30:17. Total running time: 47min 36s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=15475)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias', 'classifier.weight']
[2m[36m(RayTrainWorker pid=15475)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=15475)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=15475)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=15475)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=15475)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=15475)[0m   rank_zero_warn(
[2m[36m(RayTrainWorker pid=15475)[0m [rank: 0] Global seed set to 42
[2m[36m(RayTrainWorker pid=15475)[0m Missing logger folder: /root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTraine

Trial status: 20 TERMINATED | 1 RUNNING | 4 PENDING
Current time: 2023-10-17 12:30:47. Total running time: 48min 6s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=15475)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00020_20_batch_size=4,learning_rate=0.0670,warmup_steps=40_2023-10-17_11-55-25/checkpoint_000000)



Trial status: 21 TERMINATED | 4 PENDING
Current time: 2023-10-17 12:31:47. Total running time: 49min 6s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| TorchTr

[2m[36m(TorchTrainer pid=15924)[0m Starting distributed worker processes: ['15976 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=15976)[0m Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=15976)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight', 'classifier.weight']
[2m[36m(RayTrainWorker pid=15976)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=15976)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=15976)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=15976)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=15976)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=15976)[0m   rank_zero_wa


Trial status: 21 TERMINATED | 1 RUNNING | 3 PENDING
Current time: 2023-10-17 12:32:17. Total running time: 49min 36s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------



Trial status: 21 TERMINATED | 1 RUNNING | 3 PENDING
Current time: 2023-10-17 12:32:48. Total running time: 50min 6s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=15976)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00021_21_batch_size=4,learning_rate=0.0000,warmup_steps=70_2023-10-17_11-56-58/checkpoint_000000)


Trial status: 21 TERMINATED | 1 RUNNING | 3 PENDING
Current time: 2023-10-17 12:33:48. Total running time: 51min 6s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00012 with val_loss=0.3649887442588806 and params={'train_loop_config': {'learning_rate': 1.6037155126499187e-05, 'batch_size': 16, 'warmup_steps': 171}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

[2m[36m(RayTrainWorker pid=15976)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00021_21_batch_size=4,learning_rate=0.0000,warmup_steps=70_2023-10-17_11-56-58/checkpoint_000001)


Trial status: 21 TERMINATED | 1 RUNNING | 3 PENDING
Current time: 2023-10-17 12:35:18. Total running time: 52min 36s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00021 with val_loss=0.35177135467529297 and params={'train_loop_config': {'learning_rate': 6.704010646472272e-06, 'batch_size': 4, 'warmup_steps': 70}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

[2m[36m(RayTrainWorker pid=15976)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00021_21_batch_size=4,learning_rate=0.0000,warmup_steps=70_2023-10-17_11-56-58/checkpoint_000002)



Trial TorchTrainer_4753d_00021 completed after 3 iterations at 2023-10-17 12:36:22. Total running time: 53min 40s
+-------------------------------------------------------------+
| Trial TorchTrainer_4753d_00021 result                       |
+-------------------------------------------------------------+
| checkpoint_dir_name                       checkpoint_000002 |
| time_this_iter_s                                   80.69672 |
| time_total_s                                      272.66211 |
| training_iteration                                        3 |
| accuracy                                            0.86029 |
| epoch                                                     2 |
| f1                                                  0.90121 |
| step                                                   2751 |
| train/train_average_loss                            0.36367 |
| train/train_loss                                    0.09458 |
| val/val_average_loss                               

[2m[36m(TorchTrainer pid=17187)[0m Starting distributed worker processes: ['17247 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=17247)[0m Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=17247)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.bias', 'classifier.bias', 'pre_classifier.weight']
[2m[36m(RayTrainWorker pid=17247)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=17247)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=17247)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=17247)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=17247)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=17247)[0m   rank_zero_wa


Trial status: 22 TERMINATED | 1 RUNNING | 2 PENDING
Current time: 2023-10-17 12:36:48. Total running time: 54min 7s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00021 with val_loss=0.3530758321285248 and params={'train_loop_config': {'learning_rate': 6.704010646472272e-06, 'batch_size': 4, 'warmup_steps': 70}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


Map:  82%|████████▏ | 3000/3668 [00:01<00:00, 2411.06 examples/s]
Map: 100%|██████████| 3668/3668 [00:01<00:00, 2342.70 examples/s]
Map:   0%|          | 0/408 [00:00<?, ? examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 2464.25 examples/s]
Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 2545.88 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 2404.97 examples/s]
[2m[36m(RayTrainWorker pid=17247)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(RayTrainWorker pid=17247)[0m Loading `train_dataloader` to estimate number of stepping batches.
[2m[36m(RayTrainWorker pid=17247)[0m 
[2m[36m(RayTrainWorker pid=17247)[0m   | Name  | Type                                | Params
[2m[36m(RayTrainWorker pid=17247)[0m --------------------------------------------------------------
[2m[36m(RayTrainWorker pid=17247)[0m 0 | model | DistilBertForSequenceClassification | 67.0 M
[2m[36m(RayTrainWorker pid=17247)[0m 

Trial status: 22 TERMINATED | 1 RUNNING | 2 PENDING
Current time: 2023-10-17 12:37:18. Total running time: 54min 37s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00021 with val_loss=0.3530758321285248 and params={'train_loop_config': {'learning_rate': 6.704010646472272e-06, 'batch_size': 4, 'warmup_steps': 70}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+


[2m[36m(RayTrainWorker pid=17247)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00022_22_batch_size=16,learning_rate=0.0001,warmup_steps=138_2023-10-17_11-58-28/checkpoint_000000)



Trial TorchTrainer_4753d_00023 started with configuration:
+------------------------------------------------------+
| Trial TorchTrainer_4753d_00023 config                |
+------------------------------------------------------+
| train_loop_config/batch_size                      16 |
| train_loop_config/learning_rate           0.00360338 |
| train_loop_config/warmup_steps                    32 |
+------------------------------------------------------+


[2m[36m(TorchTrainer pid=17670)[0m Starting distributed worker processes: ['17736 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=17736)[0m Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=17736)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
[2m[36m(RayTrainWorker pid=17736)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=17736)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=17736)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=17736)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=17736)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=17736)[0m   rank_zero_wa


Trial status: 23 TERMINATED | 1 RUNNING | 1 PENDING
Current time: 2023-10-17 12:38:18. Total running time: 55min 37s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00021 with val_loss=0.3530758321285248 and params={'train_loop_config': {'learning_rate': 6.704010646472272e-06, 'batch_size': 4, 'warmup_steps': 70}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

Map:  27%|██▋       | 1000/3668 [00:00<00:01, 2488.64 examples/s]
Map:  55%|█████▍    | 2000/3668 [00:00<00:00, 2498.36 examples/s]
Map:  82%|████████▏ | 3000/3668 [00:01<00:00, 2660.15 examples/s]
Map: 100%|██████████| 3668/3668 [00:01<00:00, 2568.83 examples/s]
Map:   0%|          | 0/408 [00:00<?, ? examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 2301.55 examples/s]
Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 4448.17 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 4464.13 examples/s]
[2m[36m(RayTrainWorker pid=17736)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(RayTrainWorker pid=17736)[0m Loading `train_dataloader` to estimate number of stepping batches.
[2m[36m(RayTrainWorker pid=17736)[0m 
[2m[36m(RayTrainWorker pid=17736)[0m   | Name  | Type                                | Params
[2m[36m(RayTrainWorker pid=17736)[0m --------------------------------------------------------------
[2m

Trial status: 23 TERMINATED | 1 RUNNING | 1 PENDING
Current time: 2023-10-17 12:38:49. Total running time: 56min 7s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00021 with val_loss=0.3530758321285248 and params={'train_loop_config': {'learning_rate': 6.704010646472272e-06, 'batch_size': 4, 'warmup_steps': 70}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|

[2m[36m(RayTrainWorker pid=17736)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00023_23_batch_size=16,learning_rate=0.0036,warmup_steps=32_2023-10-17_12-00-51/checkpoint_000000)



Trial TorchTrainer_4753d_00024 started with configuration:
+----------------------------------------------------+
| Trial TorchTrainer_4753d_00024 config              |
+----------------------------------------------------+
| train_loop_config/batch_size                     8 |
| train_loop_config/learning_rate           0.042806 |
| train_loop_config/warmup_steps                  40 |
+----------------------------------------------------+


[2m[36m(TorchTrainer pid=18122)[0m Starting distributed worker processes: ['18176 (172.28.0.12)']
[2m[36m(RayTrainWorker pid=18176)[0m Setting up process group for: env:// [rank=0, world_size=1]
[2m[36m(RayTrainWorker pid=18176)[0m Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'classifier.bias', 'pre_classifier.bias']
[2m[36m(RayTrainWorker pid=18176)[0m You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[2m[36m(RayTrainWorker pid=18176)[0m GPU available: True (cuda), used: True
[2m[36m(RayTrainWorker pid=18176)[0m TPU available: False, using: 0 TPU cores
[2m[36m(RayTrainWorker pid=18176)[0m IPU available: False, using: 0 IPUs
[2m[36m(RayTrainWorker pid=18176)[0m HPU available: False, using: 0 HPUs
[2m[36m(RayTrainWorker pid=18176)[0m   rank_zero_wa


Trial status: 24 TERMINATED | 1 RUNNING
Current time: 2023-10-17 12:39:49. Total running time: 57min 7s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00021 with val_loss=0.3530758321285248 and params={'train_loop_config': {'learning_rate': 6.704010646472272e-06, 'batch_size': 4, 'warmup_steps': 70}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| TorchTrain

[2m[36m(RayTrainWorker pid=18176)[0m Map:   0%|          | 0/408 [00:00<?, ? examples/s]
Map: 100%|██████████| 408/408 [00:00<00:00, 3666.90 examples/s]
Map:   0%|          | 0/1725 [00:00<?, ? examples/s]
Map:  58%|█████▊    | 1000/1725 [00:00<00:00, 2612.20 examples/s]
Map: 100%|██████████| 1725/1725 [00:00<00:00, 2486.76 examples/s]
[2m[36m(RayTrainWorker pid=18176)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[2m[36m(RayTrainWorker pid=18176)[0m Loading `train_dataloader` to estimate number of stepping batches.
[2m[36m(RayTrainWorker pid=18176)[0m 
[2m[36m(RayTrainWorker pid=18176)[0m   | Name  | Type                                | Params
[2m[36m(RayTrainWorker pid=18176)[0m --------------------------------------------------------------
[2m[36m(RayTrainWorker pid=18176)[0m 0 | model | DistilBertForSequenceClassification | 67.0 M
[2m[36m(RayTrainWorker pid=18176)[0m --------------------------------------------------------------
[2m[36m(RayTrainWorker pid

Trial status: 24 TERMINATED | 1 RUNNING
Current time: 2023-10-17 12:40:19. Total running time: 57min 37s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00021 with val_loss=0.3530758321285248 and params={'train_loop_config': {'learning_rate': 6.704010646472272e-06, 'batch_size': 4, 'warmup_steps': 70}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| TorchTrain

[2m[36m(RayTrainWorker pid=18176)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00024_24_batch_size=8,learning_rate=0.0428,warmup_steps=40_2023-10-17_12-03-15/checkpoint_000000)


Trial status: 24 TERMINATED | 1 RUNNING
Current time: 2023-10-17 12:41:19. Total running time: 58min 38s
Logical resource usage: 1.0/2 CPUs, 1.0/1 GPUs (0.0/1.0 accelerator_type:None)
Current best trial: 4753d_00021 with val_loss=0.3530758321285248 and params={'train_loop_config': {'learning_rate': 6.704010646472272e-06, 'batch_size': 4, 'warmup_steps': 70}}
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| Trial name                 status         ...fig/learning_rate     ...config/batch_size     ...nfig/warmup_steps     iter     total time (s)     train/train_loss     val/val_loss     val_loss     accuracy |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| TorchTrain

[2m[36m(RayTrainWorker pid=18176)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00024_24_batch_size=8,learning_rate=0.0428,warmup_steps=40_2023-10-17_12-03-15/checkpoint_000001)


In [10]:
results.get_best_result(metric="val_loss", mode="min")

Result(
  metrics={'train/train_loss': 0.09457603096961975, 'val/val_loss': 0.3530758023262024, 'val_loss': 0.3530758321285248, 'accuracy': 0.8602941036224365, 'f1': 0.9012131690979004, 'val/val_average_loss': 0.3530758321285248, 'train/train_average_loss': 0.36366596817970276, 'epoch': 2, 'step': 2751},
  path='/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00021_21_batch_size=4,learning_rate=0.0000,warmup_steps=70_2023-10-17_11-56-58',
  filesystem='local',
  checkpoint=Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2023-10-17_11-42-32/TorchTrainer_4753d_00021_21_batch_size=4,learning_rate=0.0000,warmup_steps=70_2023-10-17_11-56-58/checkpoint_000002)
)