# Dry Bean Variety Prediction by Multiclass Classification using ANN with Ray Tune

In [1]:
import os
import numpy as np
import pandas as pd

import torch

from torch.utils.data import TensorDataset, DataLoader, Dataset
from tqdm.notebook import tqdm
from typing import List

## Features and Target to be Used

In [2]:
x_names = [
    "Area", "Perimeter", "MajorAxisLength", "MinorAxisLength", "Eccentricity", "ConvexArea",
    "EquivDiameter", "Extent", "Solidity", "ShapeFactor1", "ShapeFactor2", "ShapeFactor3", "ShapeFactor4"]
y_name = "Class"
y_classes = ["SEKER", "BARBUNYA", "BOMBAY", "CALI", "DERMASON", "HOROZ", "SIRA"]

### Helper functions

In [3]:
# Convert Pandas dataframe to PyTorch dataset
def df_to_dataset(df: pd.DataFrame) -> Dataset:
    features = df[x_names].to_numpy(dtype=np.float32)
    # preprocess data
    df["Area"] /= 500_000
    df["Perimeter"] /= 5000
    df["MajorAxisLength"] /= 2000
    df["MinorAxisLength"] /= 2000
    df["ConvexArea"] /= 500_000
    df["EquivDiameter"] /= 2000
    # preprocess labels
    labels = df[[y_name]].applymap(lambda x: y_classes.index(x)) # map Class labels to int
    labels = labels.to_numpy(dtype=np.int64).squeeze(axis=1)
    # create the dataset
    features = torch.from_numpy(features)
    labels = torch.from_numpy(labels)
    my_dataset = TensorDataset(features, labels)
    return my_dataset

## Train Section

### Data Loader

In [4]:
def load_data(data_dir="DryBeanDataset", batch_size=64):
    df_train = pd.read_csv(os.path.join(data_dir, "dry_bean_train.csv"))
    df_val = pd.read_csv(os.path.join(data_dir, "dry_bean_val.csv"))

    ds_train = df_to_dataset(df_train)
    ds_val = df_to_dataset(df_val)
    # Create PyTorch data loader
    loader_train = DataLoader(ds_train, batch_size=batch_size, shuffle=True, drop_last=True) # drop last for stability
    loader_val = DataLoader(ds_val, batch_size=batch_size)
    return loader_train, loader_val

### Model training using Ray Tune

In [5]:
import logging
import models
import ray

from datetime import datetime
from ray import tune
from ray.air import session, RunConfig, FailureConfig, CheckpointConfig
from ray.air.checkpoint import Checkpoint
from sklearn.metrics import f1_score, accuracy_score

In [6]:
def train(config: dict, data_dir="DryBeanDataset"):
    # Load the data
    loader_train, loader_val = load_data(data_dir)
    # Which device we will use for training process (CPU/GPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)
    # Create the model
    model = models.models[config["arch"]](n_features=len(x_names), n_classes=len(y_classes))
    # Move the model from CPU to the device
    # Actually, only required if the device is not CPU and has no effect if it is CPU
    model = model.to(device)
    # Define the loss function and the optimizer
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=config["init_lr"], betas=(config["beta_1"], config["beta_2"]))
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, min_lr=1e-9)
    # Prepare the logger
    # To restore a checkpoint, use `session.get_checkpoint()`.
    loaded_checkpoint = session.get_checkpoint()
    if loaded_checkpoint:
        with loaded_checkpoint.as_directory() as loaded_checkpoint_dir:
           model_state, optimizer_state = torch.load(os.path.join(loaded_checkpoint_dir, "checkpoint.pt"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)
    # The training and validation process
    # Variables to hold some training status
    epoch_number = 0
    # Training loop
    for epoch in range(config["max_epochs"]):
        # Make sure gradient tracking is on, and do a pass over the data for the training process
        model.train()
        running_loss = 0.
        for i, data in enumerate(loader_train):
            # Every data instance is an input & label pair
            inputs, labels = data
            # We move the data instance from CPU to the device
            inputs = inputs.to(device)
            labels = labels.to(device)
            # Zero your gradients for every batch!
            optimizer.zero_grad()
            # Make predictions for this batch
            outputs = model(inputs)
            # Compute the loss and its gradients
            loss = loss_fn(outputs, labels)
            loss.backward()
            # Adjust learning weights
            optimizer.step()
            # Gather data and report
            running_loss += loss.detach().item()
        # Calculate the average training loss
        avg_loss = running_loss / (i + 1)

        # We don't need gradients for the model validation process
        model.eval()
        running_vloss = 0.0
        y_true = []
        y_pred = []
        with torch.no_grad():
            for i, vdata in enumerate(loader_val):
                vinputs, vlabels = vdata
                y_true.extend(vlabels.numpy().tolist())
                voutputs = model(vinputs.to(device))
                vloss = loss_fn(voutputs, vlabels.to(device))
                running_vloss += vloss.item()
                # the class with the highest energy is what we choose as prediction
                _, predicted = torch.max(voutputs.data, 1)
                y_pred.extend(predicted.cpu().numpy().tolist())

        # Calculate the average validation loss
        avg_vloss = running_vloss / (i + 1)
        scheduler.step(avg_vloss)
        # Calculate our classification metrics
        acc = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred, average="weighted")

        # Here we save a checkpoint. It is automatically registered with
        # Ray Tune and can be accessed through `session.get_checkpoint()`
        # API in future iterations.
        # Track best performance, and save the model's state (weights)
        save_dir = os.path.abspath('train')
        os.makedirs(save_dir, exist_ok=True)
        checkpoint_path = os.path.join(save_dir, 'checkpoint.pt')
        torch.save((model.state_dict(), optimizer.state_dict()), checkpoint_path)
        # Log the running loss averaged per batch
        # for both training and validation
        checkpoint = Checkpoint.from_directory(save_dir)
        session.report({
            'train_loss': avg_loss,
            'val_loss': avg_vloss,
            'val_acc': acc,
            'val_weighted_f1': f1
        }, checkpoint=checkpoint)

        epoch_number += 1

In [7]:
MAX_EPOCHS = 1000

ray.init(ignore_reinit_error=True, num_gpus=1)

param_space = {
    "arch": tune.grid_search(list(models.models.keys())),
    "init_lr": tune.loguniform(1e-2, 1e-6),
    "beta_1": tune.uniform(0.1, 0.9),
    "beta_2": tune.uniform(0.6, 0.99),
    "max_epochs": MAX_EPOCHS
}
tune_config = tune.TuneConfig(
    metric="val_weighted_f1",
    mode="max",
    scheduler=tune.schedulers.ASHAScheduler(max_t=MAX_EPOCHS,grace_period=MAX_EPOCHS//10),
    num_samples=5
)
run_config = RunConfig(
    local_dir="runs_clf_tune",
    stop=tune.stopper.TrialPlateauStopper(metric="val_loss", std=0.02, num_results=MAX_EPOCHS//20, grace_period=MAX_EPOCHS//10),
    failure_config=FailureConfig(fail_fast=True),
    sync_config=tune.SyncConfig(),
    progress_reporter=tune.JupyterNotebookReporter(),
    log_to_file=True,
    checkpoint_config=CheckpointConfig(
        num_to_keep=1,
        checkpoint_score_attribute="val_weighted_f1",
        checkpoint_score_order="max",
        checkpoint_at_end=True
    ),
)

tuner = tune.Tuner(
    tune.with_resources(
        tune.with_parameters(train, data_dir=os.path.abspath("DryBeanDataset")),
        {"cpu": 2, "gpu": 0.2}),
    tune_config=tune_config,
    param_space=param_space,
    run_config=run_config)
results = tuner.fit()

Trial name,status,loc,arch,beta_1,beta_2,init_lr,iter,total time (s),train_loss,val_loss,val_acc
train_0baf6_00000,TERMINATED,172.24.150.175:30209,MLP3Layers,0.456297,0.963005,2.25669e-06,242,159.611,0.982509,0.95766,0.674504
train_0baf6_00001,TERMINATED,172.24.150.175:30236,MLP4Layers,0.153105,0.611412,8.48668e-05,114,80.3952,0.872285,0.82679,0.677443
train_0baf6_00002,TERMINATED,172.24.150.175:30238,MLP5Layers,0.660713,0.79081,2.64964e-06,100,84.7074,1.38264,1.39001,0.37399
train_0baf6_00003,TERMINATED,172.24.150.175:30241,MLP6Layers,0.419534,0.607438,7.68568e-06,100,144.28,1.17997,1.20276,0.549596
train_0baf6_00004,TERMINATED,172.24.150.175:30244,MLP3Layers,0.492631,0.618265,0.00166074,100,65.832,1.83743,1.84069,0.249082
train_0baf6_00005,TERMINATED,172.24.150.175:30244,MLP4Layers,0.616983,0.978244,0.000268894,100,68.8361,1.83766,1.84047,0.249082
train_0baf6_00006,TERMINATED,172.24.150.175:30236,MLP5Layers,0.824499,0.794034,1.37914e-05,100,82.787,1.36034,1.34985,0.478325
train_0baf6_00007,TERMINATED,172.24.150.175:30238,MLP6Layers,0.78316,0.843288,0.000680648,100,149.215,1.8376,1.84053,0.249082
train_0baf6_00008,TERMINATED,172.24.150.175:30244,MLP3Layers,0.849601,0.682178,0.000196229,152,98.655,1.5659,1.32204,0.76194
train_0baf6_00009,TERMINATED,172.24.150.175:30241,MLP4Layers,0.598643,0.672967,0.000692481,100,69.0889,1.83775,1.84054,0.249082




[2m[36m(train pid=30209)[0m Using device: cuda
[2m[36m(train pid=30238)[0m Using device: cuda
[2m[36m(train pid=30236)[0m Using device: cuda
[2m[36m(train pid=30241)[0m Using device: cuda
[2m[36m(train pid=30244)[0m Using device: cuda
Result for train_0baf6_00000:
  date: 2022-09-05_19-48-38
  done: false
  experiment_id: 6bd7c4f9da5b45f9a7bf13f058dd95cd
  hostname: MSI
  iterations_since_restore: 1
  node_ip: 172.24.150.175
  pid: 30209
  should_checkpoint: true
  time_since_restore: 4.053349494934082
  time_this_iter_s: 4.053349494934082
  time_total_s: 4.053349494934082
  timestamp: 1662382118
  timesteps_since_restore: 0
  train_loss: 339.6617796968769
  training_iteration: 1
  trial_id: 0baf6_00000
  val_acc: 0.20132255694342396
  val_loss: 11.553337443958629
  val_weighted_f1: 0.07938693321218257
  warmup_time: 0.002660036087036133
  
Result for train_0baf6_00004:
  date: 2022-09-05_19-48-39
  done: false
  experiment_id: 3c4158612a274806ad4205dedec1348c
  hostname

2022-09-05 19:58:11,863	INFO tune.py:758 -- Total run time: 580.69 seconds (580.54 seconds for the tuning loop).


## Test Section

### Data Loader

In [8]:
def load_test_data(data_dir="DryBeanDataset", batch_size=1):
    df_test = pd.read_csv(os.path.join(data_dir, "dry_bean_test.csv"))

    ds_test = df_to_dataset(df_test)
    # Create PyTorch data loader
    loader_test = DataLoader(ds_test, batch_size=batch_size)
    return loader_test

### Test the best result

In [11]:
def test_best_model(best_result):
    # Load the data
    loader_test = load_test_data()
    # Which device we will use for training process (CPU/GPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Using device:", device)
    # Create the model
    model = models.models[best_result.config["arch"]](n_features=len(x_names), n_classes=len(y_classes))
    # Move the model from CPU to the device
    # Actually, only required if the device is not CPU and has no effect if it is CPU
    model = model.to(device)
    # Load the trained weights of the model parameters
    weight_path = os.path.join(best_result.checkpoint.to_directory(), "checkpoint.pt")
    model_state, _ = torch.load(weight_path)
    model.load_state_dict(model_state)
    # The testing process
    # We don't need gradients for the model testing process
    model.eval()
    y_true = []
    y_pred = []
    with torch.no_grad():
        for i, vdata in tqdm(enumerate(loader_test)):
            vinputs, vlabels = vdata
            y_true.extend(vlabels.numpy().tolist())
            voutputs = model(vinputs.to(device))
            # the class with the highest energy is what we choose as prediction
            _, predicted = torch.max(voutputs.data, 1)
            y_pred.extend(predicted.cpu().numpy().tolist())

    # Calculate our classification metrics
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average="weighted")
    print(f'Accuracy: {acc}, Weighted F1: {f1}')


In [12]:
best_result = results.get_best_result("val_loss", "min")

print("Best trial config: {}".format(best_result.config))
print("Best trial final validation loss: {}".format(
    best_result.metrics["val_loss"]))
print("Best trial final validation accuracy: {}".format(
    best_result.metrics["val_acc"]))
print("Best trial final validation weighted F1: {}".format(
    best_result.metrics["val_weighted_f1"]))

test_best_model(best_result)

Best trial config: {'arch': 'MLP6Layers', 'init_lr': 0.00013011698059279522, 'beta_1': 0.6271696733634531, 'beta_2': 0.6816242674528422, 'max_epochs': 1000}
Best trial final validation loss: 0.269253671169281
Best trial final validation accuracy: 0.9125642909625276
Best trial final validation weighted F1: 0.9119572192332976
Using device: cuda


0it [00:00, ?it/s]

Accuracy: 0.9011756061719324, Weighted F1: 0.9010291180764783


## References
[1] https://pytorch.org/tutorials/beginner/introyt/trainingyt.html