In [19]:

!pip install 'ray[tune]' 'ray[default]'
!pip install -U ray[tune] pytorch-lightning torchmetrics




In [24]:
import pandas as pd
from sklearn.datasets import load_iris

# Load Iris dataset
iris = load_iris()

# Create a DataFrame
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['target'] = iris.target

# Display the DataFrame
iris_df.sample(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
13,4.3,3.0,1.1,0.1,0
22,4.6,3.6,1.0,0.2,0
50,7.0,3.2,4.7,1.4,1


In [25]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset

# Load and prepare the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Preprocess the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Convert arrays to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
X_test_tensor = torch.FloatTensor(X_test)
y_train_tensor = torch.LongTensor(y_train)
y_test_tensor = torch.LongTensor(y_test)

# Create datasets and dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

# Vanilla Pytorch




In [26]:


# Define a 3-layer Perceptron
class ThreeLayerPerceptron(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(ThreeLayerPerceptron, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.relu(self.layer1(x))
        out = self.relu(self.layer2(out))
        return self.layer3(out)

model = ThreeLayerPerceptron(input_size=4, hidden_size=100, num_classes=3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with accuracy calculation
num_epochs = 100
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for inputs, labels in train_loader:
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        total_loss += loss.item()

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Calculate average loss and accuracy
    avg_loss = total_loss / len(train_loader)
    accuracy = 100 * correct / total

    # Print every 10 epochs (or adjust to your preference)
    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.2f}%')

print("Training complete")


Epoch [10/100], Loss: 0.2114, Accuracy: 95.00%
Epoch [20/100], Loss: 0.0709, Accuracy: 98.33%
Epoch [30/100], Loss: 0.0516, Accuracy: 98.33%
Epoch [40/100], Loss: 0.0487, Accuracy: 97.50%
Epoch [50/100], Loss: 0.0382, Accuracy: 99.17%
Epoch [60/100], Loss: 0.0336, Accuracy: 98.33%
Epoch [70/100], Loss: 0.0328, Accuracy: 99.17%
Epoch [80/100], Loss: 0.0270, Accuracy: 99.17%
Epoch [90/100], Loss: 0.0251, Accuracy: 99.17%
Epoch [100/100], Loss: 0.0252, Accuracy: 99.17%
Training complete


# Pytorch Lightning for Equivalent as Above

## Pytorch Lightnign Provides Abstraction and Simplification:
### Standardized Interface:
- LightningModule provides a structured interface for defining models, where you organize your computations (forward pass) separately from your optimization logic (training step). This separation makes the code cleaner and easier to understand.

### Automatic Optimization:
- By implementing configure_optimizers, PyTorch Lightning handles the backward pass and optimizer steps internally, reducing the boilerplate code needed for training loops.

### Logging and Metrics:
- Lightning's self.log method simplifies logging metrics like loss and accuracy. It automatically handles logging for both steps and epochs, integrates with various logging backends, and supports distributed environments without extra code.

### Data Handling:
- By defining data-related methods like train_dataloader, Lightning abstracts away the data loading process, making it easier to work with different data sources and splits.

### Predictions:
-  The predict_step method in LightningModule standardizes the prediction process, allowing for a clean way to define how predictions are made. The Trainer's predict method can then automatically handle batched predictions across potentially multiple dataloaders.

### Advanced Training Strategies:
- PyTorch Lightning supports advanced strategies like Distributed Data Parallel (DDP), Fully Sharded Data Parallelism (FSDP), and integration with DeepSpeed out of the box. These strategies can significantly speed up training and are enabled with simple Trainer flags, abstracting away the complexity of setting up distributed training.

In [27]:
import pytorch_lightning as pl
from pytorch_lightning import LightningModule, Trainer
from torchmetrics.classification.accuracy import Accuracy

class LightningThreeLayerPerceptron(LightningModule):
    def __init__(self, input_size=4, hidden_size=100, num_classes=3):
        super().__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, num_classes)
        self.criterion = nn.CrossEntropyLoss()
        self.accuracy = Accuracy(task='multiclass', num_classes=num_classes, average='macro')

    def forward(self, x):
        x = torch.relu(self.layer1(x))
        x = torch.relu(self.layer2(x))
        return self.layer3(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.criterion(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = self.accuracy(preds, y)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        self.log('train_acc', acc, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=0.001)
        return optimizer

    def train_dataloader(self):
        return train_loader

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        x, _ = batch  # Assuming your DataLoader yields a tuple (inputs, labels)
        logits = self(x)  # Forward pass to get logits
        # Optionally apply softmax for probabilities (not needed for just getting the class predictions)
        # probabilities = torch.softmax(logits, dim=1)
        return logits  # Or return `probabilities` if you applied softmax

# Initialize the PyTorch Lightning model
lightning_model = LightningThreeLayerPerceptron(input_size=4, hidden_size=100, num_classes=3)

# Set up the trainer
# example of setting up the trainer for DDP Distributed Data Parallel
# trainer = Trainer(max_epochs=10, accelerator="gpu", devices=2, strategy="ddp")

# FSDP Fully Sharded Data Parallelism
# trainer = L.Trainer(accelerator="cuda", devices=2, strategy="fsdp")

# DeepSpeed
# trainer = Trainer(accelerator="gpu", devices=4, strategy="deepspeed_stage_2_offload", precision=16)
trainer = Trainer(max_epochs=10)

# Train the model
trainer.fit(lightning_model)

# This will automatically call your `predict_step()` for each batch in `test_loader`
predictions = trainer.predict(model=lightning_model, dataloaders=test_loader)

predicted_classes = [torch.argmax(batch_logits, dim=1) for batch_logits in predictions]
# Optionally, concatenate the predictions from all batches if you want a single tensor
predicted_classes = torch.cat(predicted_classes)


# Now calculate the accuracy using tensors
correct_predictions = torch.eq(predicted_classes, y_test_tensor).sum().item()
total_predictions = y_test_tensor.size(0)
accuracy = correct_predictions / total_predictions

print(f'Accuracy: {accuracy*100:.2f}%')

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name      | Type               | Params
-------------------------------------------------
0 | layer1    | Linear             | 500   
1 | layer2    | Linear             | 10.1 K
2 | layer3    | Linear             | 303   
3 | criterion | CrossEntropyLoss   | 0     
4 | accuracy  | MulticlassAccuracy | 0     
-------------------------------------------------
10.9 K    Trainable params
0         Non-trainable params
10.9 K    Total params
0.044     Total estimated model params size (MB)
/usr/local/lib/python3.10/dist-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (8) is smaller than the log

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=10` reached.


Predicting: |          | 0/? [00:00<?, ?it/s]

Accuracy: 100.00%


# Pytorch Lightning with Ray Tune
### Ray Tune, - a library for distributed hyperparameter tuning.
### Integration with PyTorch Lightning:
-  The code demonstrates how Ray Tune can be integrated with PyTorch Lightning for model training and hyperparameter tuning, leveraging the structured approach of Lightning for defining models and training logic.

### ASHA Scheduler:
-  The Asynchronous HyperBand Scheduler (ASHA) is utilized to dynamically allocate resources to trials and early-stop less promising ones. This scheduler improves the efficiency of the tuning process by focusing on more promising hyperparameter configurations.

### Checkpointing and Reporting:
-  The TuneReportCheckpointCallback is used to report metrics back to Ray Tune and to save checkpoints during training. This callback ensures that important metrics like loss and accuracy are tracked across different trials and that the model's state can be saved at specified points (e.g., at the end of validation).


In [36]:
import os
import ray
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.integration.pytorch_lightning import TuneReportCheckpointCallback
from ray.train.lightning import (
    RayDDPStrategy,
    RayLightningEnvironment,
    RayTrainReportCallback,
    prepare_trainer,
)


def train_iris_tune(config, num_epochs=10, num_gpus=0):
    # Data preparation
    iris = load_iris()
    X, y = iris.data, iris.target
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
    X_train_tensor = torch.FloatTensor(X_train)
    y_train_tensor = torch.LongTensor(y_train)
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)

    # Model initialization with config from Ray Tune
    model = LightningThreeLayerPerceptron(
        input_size=4,
        hidden_size=config["hidden_size"],
        num_classes=3
    )

    trainer = pl.Trainer(
        max_epochs=num_epochs,
        # devices="auto",
        # accelerator="auto",
        # possibility of using RayDDPStrategy for
        # distributed data parallel training within the Ray ecosystem.
        # strategy=RayDDPStrategy(),
        callbacks=[
            TuneReportCheckpointCallback(
                metrics={"loss": "train_loss", "acc": "train_acc"},  # Ensure these match your logged names
                filename="checkpoint.ckpt",
                on="validation_end"  # or use "epoch_end" based on when you're logging metrics
            )
        ],
        plugins=[RayLightningEnvironment()],
        enable_progress_bar=True,
    )

    trainer.fit(model)

def tune_iris_model(num_samples, num_epochs, gpus_per_trial):
    config = {
        "lr": tune.loguniform(1e-4, 1e-1),
        "hidden_size": tune.choice([50, 100])
    }

    asha_scheduler = ASHAScheduler(
        metric="acc",
        mode="max",
        max_t=num_epochs,
        grace_period=1,
        reduction_factor=2
    )

    tuner = tune.Tuner(
        train_iris_tune,
        param_space=config,
        tune_config=tune.TuneConfig(scheduler=asha_scheduler),
    )
    return tuner.fit()


# Disable strict metric checking
os.environ["TUNE_DISABLE_STRICT_METRIC_CHECKING"] = "1"
# Initialize Ray
ray.shutdown()
ray.init(num_cpus=1)

# Run functions above
tune_iris_model(num_samples=2, num_epochs=2, gpus_per_trial=0)


2024-02-20 05:58:10,289	INFO worker.py:1715 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
2024-02-20 05:58:12,379	INFO tune.py:592 -- [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949


+------------------------------------------------------------------------+
| Configuration for experiment     train_iris_tune_2024-02-20_05-58-12   |
+------------------------------------------------------------------------+
| Search algorithm                 BasicVariantGenerator                 |
| Scheduler                        AsyncHyperBandScheduler               |
| Number of trials                 1                                     |
+------------------------------------------------------------------------+

View detailed results here: /root/ray_results/train_iris_tune_2024-02-20_05-58-12
To visualize your results with TensorBoard, run: `tensorboard --logdir /root/ray_results/train_iris_tune_2024-02-20_05-58-12`

Trial status: 1 PENDING
Current time: 2024-02-20 05:58:12. Total running time: 0s
Logical resource usage: 0/1 CPUs, 0/0 GPUs
+--------------------------------------------------------------------+
| Trial name                    status            lr     hidden_size 

[36m(train_iris_tune pid=69743)[0m GPU available: False, used: False
[36m(train_iris_tune pid=69743)[0m TPU available: False, using: 0 TPU cores
[36m(train_iris_tune pid=69743)[0m IPU available: False, using: 0 IPUs
[36m(train_iris_tune pid=69743)[0m HPU available: False, using: 0 HPUs
[36m(train_iris_tune pid=69743)[0m Missing logger folder: /root/ray_results/train_iris_tune_2024-02-20_05-58-12/train_iris_tune_07fe4_00000_0_hidden_size=50,lr=0.0154_2024-02-20_05-58-12/lightning_logs
[36m(train_iris_tune pid=69743)[0m 2024-02-20 05:58:21.896570: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
[36m(train_iris_tune pid=69743)[0m 2024-02-20 05:58:21.896661: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
[36m(

[36m(train_iris_tune pid=69743)[0m Training: |          | 0/? [00:00<?, ?it/s]Training:   0%|          | 0/8 [00:00<?, ?it/s]Epoch 0:   0%|          | 0/8 [00:00<?, ?it/s] 
Epoch 0:  25%|██▌       | 2/8 [00:00<00:00, 15.29it/s, v_num=0, train_loss_step=1.070, train_acc_step=0.500]
Epoch 1:  75%|███████▌  | 6/8 [00:00<00:00, 130.11it/s, v_num=0, train_loss_step=0.905, train_acc_step=0.750, train_loss_epoch=1.020, train_acc_epoch=0.743]
Epoch 3:  12%|█▎        | 1/8 [00:00<00:00, 122.61it/s, v_num=0, train_loss_step=0.788, train_acc_step=0.810, train_loss_epoch=0.808, train_acc_epoch=0.820]
Epoch 4:  50%|█████     | 4/8 [00:00<00:00, 113.93it/s, v_num=0, train_loss_step=0.571, train_acc_step=0.833, train_loss_epoch=0.708, train_acc_epoch=0.806]
Epoch 5:  88%|████████▊ | 7/8 [00:00<00:00, 125.11it/s, v_num=0, train_loss_step=0.658, train_acc_step=0.704, train_loss_epoch=0.620, train_acc_epoch=0.785]
Epoch 5: 100%|██████████| 8/8 [00:00<00:00, 109.91it/s, v_num=0, train_loss_step=0.41

[36m(train_iris_tune pid=69743)[0m `Trainer.fit` stopped: `max_epochs=10` reached.


ResultGrid<[
  Result(
    metrics={},
    path='/root/ray_results/train_iris_tune_2024-02-20_05-58-12/train_iris_tune_07fe4_00000_0_hidden_size=50,lr=0.0154_2024-02-20_05-58-12',
    filesystem='local',
    checkpoint=None
  )
]>

# PyTorch with Ray Train No Pytorch Lighting
- Provides a straightforward way to scale PyTorch training to multiple workers.
- TorchTrainer automatically handles data parallelism and distribution.
- Unlike specifying the possibility of DDP (Distributed Data Parallel), distributed training is abstracted by Ray Train.
- This code is executing a single training job configured by TorchTrainer.

In [15]:
import os
from ray import train
from ray.train.torch import TorchTrainer
from ray.air.config import ScalingConfig

from torch.utils.data import DataLoader, TensorDataset
import tempfile


# Create a TensorDataset and DataLoader for the training data
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)

# Define a 3-layer Perceptron
class ThreeLayerPerceptron(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(ThreeLayerPerceptron, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, hidden_size)
        self.layer3 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        out = self.relu(self.layer1(x))
        out = self.relu(self.layer2(out))
        return self.layer3(out)

# Training function for Ray
def train_func(config):

    model = ThreeLayerPerceptron(input_size=4, hidden_size=100, num_classes=3)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    data_dir = os.path.join(tempfile.gettempdir(), "data")

    for epoch in range(10):  # Using 10 epochs for demonstration
        model.train()
        total_loss = 0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")
        # [3] Report metrics and checkpoint.
        metrics = {"loss": loss.item(), "epoch": epoch}
        with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
            # Directly save the state_dict of the model without using `.module`
            torch.save(
                model.state_dict(),
                os.path.join(temp_checkpoint_dir, "model.pt")
            )
            ray.train.report(
                metrics,
                checkpoint=ray.train.Checkpoint.from_directory(temp_checkpoint_dir),
            )


# Initialize Ray
ray.shutdown()
ray.init(num_cpus=1)

# Configure the training job
trainer = TorchTrainer(
    train_func,
    scaling_config=ScalingConfig(num_workers=1, use_gpu=False),
    # Configure additional parameters as needed
)

# Start the training
result = trainer.fit()
print("Training complete")

with result.checkpoint.as_directory() as checkpoint_dir:
    model_state_dict = torch.load(os.path.join(checkpoint_dir, "model.pt"))
    model = ThreeLayerPerceptron(input_size=4, hidden_size=100, num_classes=3)
    model.load_state_dict(model_state_dict)

2024-02-20 04:51:44,057	INFO worker.py:1715 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
2024-02-20 04:51:45,642	INFO tune.py:592 -- [output] This will use the new output engine with verbosity 1. To disable the new output and use the legacy output engine, set the environment variable RAY_AIR_NEW_OUTPUT=0. For more information, please see https://github.com/ray-project/ray/issues/36949



View detailed results here: /root/ray_results/TorchTrainer_2024-02-20_04-51-45
To visualize your results with TensorBoard, run: `tensorboard --logdir /root/ray_results/TorchTrainer_2024-02-20_04-51-45`

Training started without custom configuration.


[36m(TorchTrainer pid=49895)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=49895)[0m - (ip=172.28.0.12, pid=49961) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=49961)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(RayTrainWorker pid=49961)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2024-02-20_04-51-45/TorchTrainer_bfa3c_00000_0_2024-02-20_04-51-45/checkpoint_000000)
[36m(RayTrainWorker pid=49961)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2024-02-20_04-51-45/TorchTrainer_bfa3c_00000_0_2024-02-20_04-51-45/checkpoint_000001)
[36m(RayTrainWorker pid=49961)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2024-02-20_04-51-45/TorchTrainer_bfa3c_00000_0_2024-02-20_04-51-45/checkpoint_000002)



Training finished iteration 1 at 2024-02-20 04:51:57. Total running time: 12s
+-----------------------------------------+
| Training result                         |
+-----------------------------------------+
| checkpoint_dir_name   checkpoint_000000 |
| time_this_iter_s                6.26227 |
| time_total_s                    6.26227 |
| training_iteration                    1 |
| epoch                                 0 |
| loss                            0.86614 |
+-----------------------------------------+
Training saved a checkpoint for iteration 1 at: (local)/root/ray_results/TorchTrainer_2024-02-20_04-51-45/TorchTrainer_bfa3c_00000_0_2024-02-20_04-51-45/checkpoint_000000

Training finished iteration 2 at 2024-02-20 04:51:57. Total running time: 12s
+-----------------------------------------+
| Training result                         |
+-----------------------------------------+
| checkpoint_dir_name   checkpoint_000001 |
| time_this_iter_s                0.00854 |
| time_tota

[36m(RayTrainWorker pid=49961)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2024-02-20_04-51-45/TorchTrainer_bfa3c_00000_0_2024-02-20_04-51-45/checkpoint_000003)
[36m(RayTrainWorker pid=49961)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2024-02-20_04-51-45/TorchTrainer_bfa3c_00000_0_2024-02-20_04-51-45/checkpoint_000004)
[36m(RayTrainWorker pid=49961)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2024-02-20_04-51-45/TorchTrainer_bfa3c_00000_0_2024-02-20_04-51-45/checkpoint_000005)
[36m(RayTrainWorker pid=49961)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/root/ray_results/TorchTrainer_2024-02-20_04-51-45/TorchTrainer_bfa3c_00000_0_2024-02-20_04-51-45/checkpoint_000006)
[36m(RayTrainWorker pid=49961)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/r

Training saved a checkpoint for iteration 8 at: (local)/root/ray_results/TorchTrainer_2024-02-20_04-51-45/TorchTrainer_bfa3c_00000_0_2024-02-20_04-51-45/checkpoint_000007

Training finished iteration 9 at 2024-02-20 04:51:58. Total running time: 12s
+-----------------------------------------+
| Training result                         |
+-----------------------------------------+
| checkpoint_dir_name   checkpoint_000008 |
| time_this_iter_s                0.00589 |
| time_total_s                      6.341 |
| training_iteration                    9 |
| epoch                                 8 |
| loss                            0.17875 |
+-----------------------------------------+
Training saved a checkpoint for iteration 9 at: (local)/root/ray_results/TorchTrainer_2024-02-20_04-51-45/TorchTrainer_bfa3c_00000_0_2024-02-20_04-51-45/checkpoint_000008

Training finished iteration 10 at 2024-02-20 04:51:58. Total running time: 12s
+-----------------------------------------+
| Training resu