In [0]:
import os
import warnings

warnings.filterwarnings("ignore")  # avoid printing out absolute paths

# Remember the directory the kernel started in the first time this cell runs,
# and always resolve the target relative to it. A plain relative chdir would
# climb three more levels on every re-run of the cell.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_START_DIR, "../../.."))

In [0]:
import warnings

import lightning.pytorch as pl
# from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor
# from lightning.pytorch.loggers import TensorBoardLogger
import numpy as np
import pandas as pd
import torch
import ray
from ray.train import lightning as rtl
from ray.train.torch import TorchTrainer
from ray import train, tune
import mlflow
from ray.air.integrations.mlflow import MLflowLoggerCallback, setup_mlflow
from ray.tune.integration.pytorch_lightning import TuneReportCallback
import tempfile
import torch.nn.functional as F
from filelock import FileLock
from torchmetrics import Accuracy
from torch.utils.data import DataLoader, random_split
from torchvision.datasets import MNIST
from torchvision import transforms


# Set up the Ray cluster

In [0]:
from ray.util.spark import setup_ray_cluster, shutdown_ray_cluster, MAX_NUM_WORKER_NODES
# Resources requested for every Ray worker node.
num_cpu_cores_per_worker = 4  # CPU cores available on each worker node
num_gpu_per_worker = 1  # GPUs available on each worker node
use_gpu = True
max_worker_nodes = 2  # define your number of worker nodes here
# username = dbutils.notebook.entry_point.getDbutils().notebook().getContext().tags().apply()


# Tear down anything left over from a previous run so the cell can be re-run.
# Only ordinary errors are treated as "nothing to shut down"; KeyboardInterrupt
# and SystemExit are allowed to propagate.
try:
    shutdown_ray_cluster()
except Exception:
    print("No Ray cluster is initiated")

try:
    ray.shutdown()
except Exception:
    print("No Ray cluster is initiated")

# Start the ray cluster and follow the output link to open the Ray Dashboard - a vital observability tool for understanding your infrastructure and application.
setup_ray_cluster(
    max_worker_nodes=max_worker_nodes,
    num_cpus_per_node=num_cpu_cores_per_worker,
    num_gpus_per_node=num_gpu_per_worker,
)
# Packages listed under "pip" in the runtime environment handed to ray.init.
runtime_env = {"pip": ["lightning", "torch", "pytorch_forecasting"]}
ray.init(address="auto", ignore_reinit_error=True, runtime_env=runtime_env)

cluster_resources = ray.cluster_resources()
print(cluster_resources)

# How many workers of `num_cpu_cores_per_worker` CPUs fit into the CPU total
# reported by the cluster (0 if the cluster reports no "CPU" resource).
num_workers = int(cluster_resources.get("CPU", 0) // num_cpu_cores_per_worker)

# Model and DataLoader

## Write a custom model and dataloader

In [0]:
# Using pytorch lightning module to define model 
class MNISTClassifier(pl.LightningModule):
    """Fully connected MNIST classifier with two configurable hidden layers.

    Args:
        config: Mapping providing ``"layer_1"`` and ``"layer_2"`` (hidden layer
            widths) and ``"lr"`` (learning rate used by the Adam optimizer).
    """

    def __init__(self, config):
        super().__init__()
        self.accuracy = Accuracy(task="multiclass", num_classes=10, top_k=1)
        self.layer_1_size = config["layer_1"]
        self.layer_2_size = config["layer_2"]
        self.lr = config["lr"]

        # mnist images are (1, 28, 28) (channels, height, width)
        self.layer_1 = torch.nn.Linear(28 * 28, self.layer_1_size)
        self.layer_2 = torch.nn.Linear(self.layer_1_size, self.layer_2_size)
        self.layer_3 = torch.nn.Linear(self.layer_2_size, 10)
        # Per-batch validation results, buffered until on_validation_epoch_end.
        self.eval_loss = []
        self.eval_accuracy = []

    def cross_entropy_loss(self, logits, labels):
        """Return the negative log-likelihood loss.

        ``forward`` already applies ``log_softmax``, so ``nll_loss`` on its
        output is the cross-entropy of the underlying class scores.
        """
        return F.nll_loss(logits, labels)

    def forward(self, x):
        """Return per-class log-probabilities for a batch of images."""
        # Flatten everything except the batch dimension for the linear layers.
        x = x.view(x.size(0), -1)

        x = self.layer_1(x)
        x = torch.relu(x)

        x = self.layer_2(x)
        x = torch.relu(x)

        x = self.layer_3(x)
        x = torch.log_softmax(x, dim=1)

        return x

    def training_step(self, train_batch, batch_idx):
        """Compute and log loss and accuracy for one training batch; return the loss."""
        x, y = train_batch
        # Call the module itself rather than .forward() so registered hooks run.
        logits = self(x)
        loss = self.cross_entropy_loss(logits, y)
        accuracy = self.accuracy(logits, y)

        self.log("ptl/train_loss", loss)
        self.log("ptl/train_accuracy", accuracy)
        return loss

    def validation_step(self, val_batch, batch_idx):
        """Compute loss and accuracy for one validation batch and buffer them."""
        x, y = val_batch
        # Call the module itself rather than .forward() so registered hooks run.
        logits = self(x)
        loss = self.cross_entropy_loss(logits, y)
        accuracy = self.accuracy(logits, y)
        self.eval_loss.append(loss)
        self.eval_accuracy.append(accuracy)
        return {"val_loss": loss, "val_accuracy": accuracy}

    def on_validation_epoch_end(self):
        """Log the mean of the buffered validation results, then clear the buffers."""
        avg_loss = torch.stack(self.eval_loss).mean()
        avg_acc = torch.stack(self.eval_accuracy).mean()
        self.log("ptl/val_loss", avg_loss, sync_dist=True)
        self.log("ptl/val_accuracy", avg_acc, sync_dist=True)
        self.eval_loss.clear()
        self.eval_accuracy.clear()

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters with learning rate ``self.lr``."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer


class MNISTDataModule(pl.LightningDataModule):
    """Lightning data module that serves MNIST train/validation/test loaders.

    Args:
        data_dir: Directory handed to ``MNIST`` as its root. When ``None`` a
            fresh temporary directory is created with ``tempfile.mkdtemp()``.
        batch_size: Batch size used by all three dataloaders.
        num_workers: ``num_workers`` passed to every ``DataLoader``.
    """

    def __init__(self, data_dir=None, batch_size=128, num_workers=4):
        super().__init__()
        self.data_dir = tempfile.mkdtemp() if data_dir is None else data_dir
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.transform = transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        )

    def prepare_data(self):
        """Download both MNIST splits into ``data_dir``.

        Without this override, calling ``prepare_data()`` on the module does
        nothing. The file lock is the same one ``setup`` uses, so concurrent
        callers sharing ``data_dir`` do not download at the same time.
        """
        with FileLock(f"{self.data_dir}.lock"):
            MNIST(self.data_dir, train=True, download=True)
            MNIST(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        """Create the train/validation split and the test dataset."""
        # download=True is kept so setup() still works when prepare_data()
        # was never called on this data_dir.
        with FileLock(f"{self.data_dir}.lock"):
            mnist = MNIST(
                self.data_dir, train=True, download=True, transform=self.transform
            )
            self.mnist_train, self.mnist_val = random_split(mnist, [55000, 5000])

            self.mnist_test = MNIST(
                self.data_dir, train=False, download=True, transform=self.transform
            )

    def _make_loader(self, dataset):
        """Build a DataLoader for ``dataset`` with the module's shared settings."""
        return DataLoader(
            dataset, batch_size=self.batch_size, num_workers=self.num_workers
        )

    def train_dataloader(self):
        """Return the DataLoader over the training split."""
        return self._make_loader(self.mnist_train)

    def val_dataloader(self):
        """Return the DataLoader over the validation split."""
        return self._make_loader(self.mnist_val)

    def test_dataloader(self):
        """Return the DataLoader over the test dataset."""
        return self._make_loader(self.mnist_test)

## Define training function

In [0]:
def tune_train_func(config, data_dir=None, num_gpus=1):
    """Build the model and data module for one trial and fit them (exercise skeleton).

    Args:
        config: Hyperparameter mapping passed to ``MNISTClassifier``;
            ``config["batch_size"]`` is also read here for the data module.
        data_dir: Forwarded to ``MNISTDataModule`` as its ``data_dir``.
        num_gpus: Not referenced by the code currently in this function.

    Note:
        ``trainer`` is not defined in this skeleton. The "YOUR CODE HERE"
        section has to create it before the final ``trainer.fit`` call can run.
    """
    # setup mlflow for logging
    
    # define model
    model = MNISTClassifier(config)
    # define the data module/data loader
    dm = MNISTDataModule(
        data_dir=data_dir, batch_size=config["batch_size"]
    )
    ########## YOUR CODE HERE ##########
    #set up the metrics mapping
    
    #enable auto logging

    # setup the trainer for pytorch lightning
    
    ########################################
    # fit the trainer
    trainer.fit(model, dm)


# Distributed training with Ray Tune

In [0]:
def tune_mnist(
    config,
    num_samples=10,
    gpus_per_trial=0,
):
    """Run a hyperparameter search and return the best configuration (exercise skeleton).

    Args:
        config: Search-space mapping; ``config['batch_size']`` is read here
            when constructing the data module.
        num_samples: Not referenced by the code currently in this function.
        gpus_per_trial: Not referenced by the code currently in this function.

    Returns:
        The ``config`` of ``results.get_best_result()``.

    Note:
        ``trainable``, ``tune_config``, ``run_config`` and ``tuner`` below are
        placeholders to be completed in the "YOUR CODE HERE" section; the
        function does not parse until ``tuner =`` is given a value.
    """
    # All trials share one data directory under the system temp directory.
    data_dir = os.path.join(tempfile.gettempdir(), "mnist_data_")
    # Download data
    MNISTDataModule(data_dir=data_dir, batch_size=config['batch_size']).prepare_data()


    ########## YOUR CODE HERE ##########
    # Set the MLflow experiment, or create it if it does not exist.


    # make your train function work with ray tune
    trainable
    # define tune config 
    tune_config
    # define run config
    run_config 

    # setup tuner
    tuner = 
    ########################################
    # fit the tuner
    results = tuner.fit()

    print("Best hyperparameters found were: ", results.get_best_result().config)
    return results.get_best_result().config

In [0]:
# Search space and fixed settings handed to tune_mnist as `config`.
tune_config = {
  "layer_1": tune.choice([10, 20, 30]), # hidden layer 1 width (add param space)
  "layer_2": tune.choice([20, 30, 40]), # hidden layer 2 width
  "lr": tune.loguniform(1e-4, 1e-1),
  "batch_size": tune.choice([64, 128]),
  'num_epochs': 5,
  "tracking_uri": mlflow.get_tracking_uri(),
  "experiment_id":  # TODO(exercise): fill in a value; the dict does not parse without one
  "experiment_name":  # TODO(exercise): fill in a value; the dict does not parse without one
}

In [0]:
# Run the hyperparameter search and keep the configuration it returns.
best_config = tune_mnist(
    config=tune_config,
    num_samples=10,
    gpus_per_trial=1,
)

# Distributed training with Ray Train

## Define train function

In [0]:
def train_func(config):
    """Build the model and data module and fit them (exercise skeleton).

    Args:
        config: Hyperparameter mapping passed to ``MNISTClassifier``;
            ``config["batch_size"]`` is also read here for the data module.

    Note:
        ``trainer`` is not defined in this skeleton. The "YOUR CODE HERE"
        section has to create it before the final ``trainer.fit`` call can run.
    """

    # define model
    model = MNISTClassifier(config)
    # define the data module/data loader
    dm = MNISTDataModule(data_dir=os.path.join(tempfile.gettempdir(), "mnist_data_"), batch_size=config["batch_size"])

    ########## YOUR CODE HERE ##########
    # setup the trainer for pytorch lightning
    
    # prepare trainer for ray train

    ########################################
    # fit the trainer
    trainer.fit(model,datamodule=dm)


In [0]:
# Reuse the best hyperparameters from the search, overriding only the epoch count.
train_config = {**best_config, "num_epochs": 10}

## Fit the model with Ray Train

In [0]:
########## YOUR CODE HERE ##########
# define scaling config
# TODO(exercise): assign `scaling_config` here; the bare name below is only a placeholder.
scaling_config

# define run config 
# TODO(exercise): assign `run_config` here; the bare name below is only a placeholder.
run_config
########################################
# Wrap train_func in a TorchTrainer using the two configs defined above.
trainer = ray.train.torch.TorchTrainer(train_func, train_loop_config=train_config, scaling_config=scaling_config,run_config=run_config)

result = trainer.fit()

## Training with single GPU

In [0]:
# Same training function and run config, scaled down to one GPU worker.
scaling_config = train.ScalingConfig(num_workers=1, use_gpu=True)
trainer = TorchTrainer(
    train_func,
    train_loop_config=train_config,
    scaling_config=scaling_config,
    run_config=run_config,
)

result = trainer.fit()