# PreReqs

Import essential libraries, split into three chunks 
- machine learning packaages
- tuning for auto-hyperparameter selection
- linear algerba, data-manipulation and miscellanious QOL

In [1]:
# Package installation - python version must be 3.12 or below due to comptability with some modules
# "type : ignore" used to stop error reporting of non resolve (not relevant)

# ML from PyTorch
import torch # type: ignore
import torch.optim as optim # type: ignore
import torch.nn.functional as F # type: ignore
from torch import nn # type: ignore
from torch.utils.data import Dataset, DataLoader # type: ignore
from torch.utils.data import random_split  # type: ignore
from torchvision.transforms import ToTensor # type: ignore
from torch.utils.data import TensorDataset, DataLoader # type: ignore
import torchvision # type: ignore
import torchvision.transforms as transforms # type: ignore
from torchmetrics.regression import MeanAbsolutePercentageError  # type: ignore

# Tuning from Ray 
import ray # type: ignore
from ray import tune  # type: ignore
from ray import train  # type: ignore
from ray.tune import CLIReporter  # type: ignore
from ray.tune.schedulers import ASHAScheduler  # type: ignore
from ray.train import Checkpoint, get_checkpoint  # type: ignore
from ray.tune.schedulers import ASHAScheduler  # type: ignore
import ray.cloudpickle as pickle  # type: ignore


# Linear algebra, array manip and data analysis
import numpy as np # type: ignore
import pandas as pd # type: ignore
import matplotlib.pyplot as plt  # type: ignore
from mpl_toolkits.mplot3d import Axes3D # type: ignore
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.preprocessing import StandardScaler # type: ignore

# misc
from functools import partial
import os
import tempfile
from pathlib import Path

In [2]:
# Block used if neccesary for trouble shooting to check whether CUDA is working

"""
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

"""

'\ndevice = (\n    "cuda"\n    if torch.cuda.is_available()\n    else "mps"\n    if torch.backends.mps.is_available()\n    else "cpu"\n)\nprint(f"Using {device} device")\n\n'

# Data import, conversion and preparation 

- importing the data from the csv and previewing 
- extrapolate the data into targets (I) and inputs (C)
- split the data chunks into training testing and validation
    -    The data is split into training (80%), validation (20% of the remaining 80%), and test sets using train_test_split
- The data is then normalised with StandardScaler and converted into pytorch tensors (now compatible with learning)


In [3]:
# Pandas data import 
data = pd.read_csv('robot_inverse_kinematics_dataset.csv')
data.head()

Unnamed: 0,q1,q2,q3,q4,q5,q6,x,y,z
0,-1.51,-0.763,1.85,-0.817,0.912,2.32,-0.0947,0.15,0.301
1,-2.84,0.52,1.58,-1.27,-1.39,0.617,0.142,-0.1,0.225
2,-1.23,0.695,1.22,-1.13,0.0343,6.27,-0.0833,0.223,0.206
3,-1.99,1.06,1.74,-1.76,-1.24,4.76,0.135,-0.0314,0.37
4,1.05,0.836,1.34,-1.89,0.484,4.38,-0.056,-0.229,0.26


In [4]:
# extrap data and create arrays
I = data[['x','y','z']].values
C = data[['q1','q2','q3']].values

In [5]:
"""
- split the data into training and testing sets
- 80% of the data will be used for training, 20% for testing
- the random state is set to 30 to ensure the same result each time the function is called
- the data is split into 4 arrays: I_train, I_test, C_train, C_test
- I_train and C_train will be used to train the model
- I_test and C_test will be used to test the model
- the shape of the data is printed to confirm the split
"""

# splits the data into 80% training and 20% testing for both I and C
I_train,I_test,C_train,C_test=train_test_split(I,C,test_size=0.2,random_state=30)
# arrays (I,C) , split size (20%) , int in random state allows for same result each time func is called 

# splits the data again , out of the 80 (training) take another 20 for validation 
I_train,I_val,C_train,C_val=train_test_split(I_train,C_train,test_size=0.25,random_state=30)
# arrays (I and C train) , 25% , rand. int

# print the shape of the data to confirm the splits
print(f"I_train: {I_train.shape}, C_train: {C_train.shape}")
print(f"I_test: {I_test.shape}, C_test: {C_test.shape}")
print(f"I_val: {I_val.shape}, C_val: {C_val.shape}")

I_train: (9000, 3), C_train: (9000, 3)
I_test: (3000, 3), C_test: (3000, 3)
I_val: (3000, 3), C_val: (3000, 3)


In [6]:
"""
- Normalise the input features and target values
- Fit the scalers on the training data and transform all splits
- Convert to PyTorch tensors
"""

# Normalise the input features and target values
scaler_input = StandardScaler()
scaler_output = StandardScaler()


# Fit the scalers on the training data and transform all splits
I_train = scaler_input.fit_transform(I_train)
I_val = scaler_input.transform(I_val)
I_test = scaler_input.transform(I_test)

C_train = scaler_output.fit_transform(C_train)
C_val = scaler_output.transform(C_val)
C_test = scaler_output.transform(C_test)

# Convert to PyTorch tensors
I_train = torch.tensor(I_train, dtype=torch.float32)
I_val = torch.tensor(I_val, dtype=torch.float32)
I_test = torch.tensor(I_test, dtype=torch.float32)

C_train = torch.tensor(C_train, dtype=torch.float32)
C_val = torch.tensor(C_val, dtype=torch.float32)
C_test = torch.tensor(C_test, dtype=torch.float32)

# Model section

- a multi-layer precepteron regressor class is defined
    - 3 points on input and output layer - to match data (x,y,z) -> (p1,p2,p3)
    - l1, l2 and l3 hidden layer configurations
    - dropout layers between each to prevent over fitting

In [7]:
"""
- Define the MLPRegressor class
- The class inherits from nn.Module
- The class has a constructor that takes the following parameters:
    - l1: number of neurons in the first hidden layer
    - l2: number of neurons in the second hidden layer
    - l3: number of neurons in the third hidden layer
    - activation: activation function to use in the hidden layers
    - dropout_prob: dropout probability
"""

class MLPRegressor(nn.Module):

    # Define the model architecture
    def __init__(self, l1=120, l2=84, l3=10, activation='Tanh', dropout_prob=0.2): # set defaults here 
        # Call the parent class constructor
        super(MLPRegressor, self).__init__()

        # Dynamically choose the activation function based on config
        self.activation = getattr(nn, activation.capitalize(), nn.Tanh)() 

        # Define hidden layer params 
        self.hidden_layers = nn.Sequential(
            nn.Linear(3, l1), # input layer -> hidden
            self.activation, # acitviation function configured in search space
            nn.Dropout(p=dropout_prob), # drop out layer - probability config in search space
            nn.Linear(l1, l2), # hidden 1 -> 2
            self.activation,
            nn.Dropout(p=dropout_prob),
            nn.Linear(l2, l3), # hidden 2 -> 3
            self.activation,
            nn.Dropout(p=dropout_prob)
        )
        
        # Define output layer with 3 outputs
        self.output_layer = nn.Linear(l3, 3)  # hidden 3 -> output
        
    # Define the forward pass    
    def forward(self, x): # x is the input data
        x = self.hidden_layers(x) # pass through hidden layers
        return self.output_layer(x) # pass through output layer

# Training function

- The train function accepts a configuration dictionary for hyperparameters (l1, l2) and trains the MLPRegressor model.
- Loss is computed with nn.MSELoss, and the model is optimized using NAdam. The script also handles checkpointing with Ray Tune.
- Validation Metrics: The script calculates validation loss and Mean Absolute Percentage Error (MAPE) for each epoch.

In [8]:
def train(config, data_dir=None, max_epochs=1000):

    # Initialize the checkpoint directory outside of any conditionals
    checkpoint_dir = "./checkpoints"  # This ensures it's defined before use
    os.makedirs(checkpoint_dir, exist_ok=True) # create the directory if it doesn't exist

    """
    Preliminary section
     - deining the mode
     - device allocation
     - selecting criterion and optimiser
     - creation of the data loaders for train and eval
     - checkpointing logic
    """
    
    # define model with variables configurable in the mlpr class def 
    model = MLPRegressor(
        l1=config["l1"],
        l2=config["l2"],
        l3=config["l3"],
        activation=config["activation"],
        dropout_prob=config["dropout_prob"]
    )
    
    # use the cuda device if available
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    # check for parralel computing ability 
    if torch.cuda.device_count() > 1:
        model = nn.DataParallel(model)
    # assign the model to the chosen device
    model.to(device)
    
    # set for regression using mean squared error loss
    criterion = nn.MSELoss()

    # Each optimiser has its own variables so they are defined here by checking which one is being used
    if config["optimiser"] == "NAdam":
        optimiser = optim.NAdam(model.parameters(), lr=config["learning_rate_init"], betas=(config["momentum"], 0.999))
    elif config["optimiser"] == "AdamW":
        optimiser = optim.AdamW(model.parameters(), lr=config["learning_rate_init"], betas=(config["momentum"], 0.999), weight_decay=1e-4)
    elif config["optimiser"] == "RAdam":
        optimiser = optim.RAdam(model.parameters(), lr=config["learning_rate_init"], betas=(config["momentum"], 0.999))
    elif config["optimiser"] == "SGD":
        optimiser = optim.SGD(model.parameters(), lr=config["learning_rate_init"], momentum=config["momentum"], weight_decay=1e-4)
    elif config["optimiser"] == "Adam":
        optimiser = optim.Adam(model.parameters(), lr=config["learning_rate_init"], betas=(config["momentum"], 0.999))


    # For batch sizing to be available we need to use data loading 

    # Training dataset
    train_loader = DataLoader(
        TensorDataset(I_train, C_train),
        batch_size=config["batch_size"],
        shuffle=True
    )
    # Validation dataset
    val_loader = DataLoader(
        TensorDataset(I_val, C_val),
        batch_size=config["batch_size"]
    )

    # fetch the checkpoint for resuming the trial
    checkpoint = get_checkpoint() # get the checkpoint from the trial
    if checkpoint: # if there is a checkpoint
        with checkpoint.as_directory() as checkpoint_dir: # open the directory
            data_path = Path(checkpoint_dir) / "data.pkl" # get the data path
            with open(data_path, "rb") as fp:           # open the file
                checkpoint_state = pickle.load(fp)      # load the checkpoint state
            start_epoch = checkpoint_state["epoch"]     # get the epoch from the checkpoint
            model.load_state_dict(checkpoint_state["model_state_dict"]) # load the model state
            optimiser.load_state_dict(checkpoint_state["optimizer_state_dict"]) # load the optimiser state
    else: # if there is no checkpoint
        start_epoch = 0 # start the epoch at 0
    
    """
    Training and evaluation loops 
    - loops over a max defined range in the function definition 
    - train first 
    - check results using eval
    """

    for epoch in range(start_epoch, max_epochs):  # loop over the dataset multiple times


        """
        Train section
        - put into training mode
        - init the variables used to track the training progress
        - run  forwards and backwards over the model
        """

        model.train()
        running_loss = 0.0
        # epoch_steps = 0

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            # forward + backward + optimize
            optimiser.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimiser.step()
            running_loss += loss.item()
        

        """
        Eval section
        - put into eval mode
        - init tracking variables
        - execute model validation using val set
        """

        # eval mode
        model.eval()

        # Validation variable inits
        val_loss = 0.0
        mape_total = 0.0

        with torch.no_grad(): 
            for inputs, targets in val_loader:

                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)

                # Calculate loss
                val_loss += criterion(outputs, targets).item()

                # Calculate MAPE
                mapecalc = MeanAbsolutePercentageError().to(device)
                mape_total += mapecalc(outputs, targets).item()

        
        """
        Tracking section 
        - track the running average of the stats
        - keep a checkpoint of the trail
        - report stats into the ray tune progress reporter
        """

        # track the average of the eval statistics 

        avg_val_loss = val_loss / len(val_loader)
        avg_mape = mape_total / len(val_loader)

        # keep a checkpoint of the trails as they run 
        checkpoint_data = {
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimiser.state_dict(),
        }
        # checkpoint saving
          # Create a temporary directory for each checkpoint
        with tempfile.TemporaryDirectory() as checkpoint_dir:
            data_path = os.path.join(checkpoint_dir, "data.pkl")
            with open(data_path, "wb") as fp:
                pickle.dump(checkpoint_data, fp)

            # Provide the checkpoint to Ray
            checkpoint = Checkpoint.from_directory(checkpoint_dir)

            # report the stats to the ray tune progress reporter
            ray.train.report(
                {"loss": avg_val_loss, "mape": avg_mape},
                checkpoint=checkpoint
            )


    
    print("Finished Training")

# Hyperparmater tuning with ray tune 

The main function is the core of the tuning
- sets up Ray Tune to optimize hyperparameters
    - activation
    - alpha
    - layer sizes
    - learning rate
    - iterations
    - tol
    - momentum
    - val frac
- An ASHA Scheduler (ASHAScheduler) is used to terminate underperforming trials early.
- Resource Allocation: The script dynamically assigns CPU and GPU resources based on availability.

In [9]:
def main(num_samples=10, max_num_epochs=1000): 

    """
    - Allocate resources based on the available resources
    - Define the hyper-parameter search space
    - Define the early stopping scheduler
    - Define the checkpoint directory
    - Run the hyper-parameter search
    - Load the best checkpoint
    - Save the best trained model
    """

    # Allocate based on available resources
    resources = ray.cluster_resources()
    available_cpus = int(resources.get("CPU", 0))
    available_gpus = int(resources.get("GPU", 0))

    resources_per_trial = {
        "cpu": max(1, available_cpus // num_samples),
        "gpu": max(1, available_gpus) if available_gpus > 0 else 0
    }

    # hyper-parameter search area 
    config = {
        "activation": tune.choice(['tanh', 'RReLU', 'Hardtanh', 'identity','LeakyReLU']),
        "alpha": tune.loguniform(1e-5, 1e-2),
        "batch_size": tune.choice([8, 32, 128, 256]),
        "l1": tune.choice([2**i for i in range(12)]),
        "l2": tune.choice([2**i for i in range(12)]),
        "l3": tune.choice([2**i for i in range(11)]),
        "learning_rate_init": tune.loguniform(1e-6, 1e-1),
        "max_iter": tune.choice([4000,10000,15000,20000]),
        "tol": tune.loguniform(1e-4, 1e-3),
        "momentum": tune.uniform(0.5, 0.99),
        "validation_fraction": tune.uniform(0.1, 0.3),
        "dropout_prob": tune.uniform(0.1,0.3),
        "optimiser": tune.choice(['NAdam', 'AdamW', 'RAdam', 'SGD', 'Adam'])
    }
    
    # early stopping for trials with bad mape values
    scheduler = ASHAScheduler(
        metric="mape",
        mode="min",
        max_t=max_num_epochs,
        grace_period=10,
        reduction_factor=2,
    )

    # checkpoint bugfix for names being too long
    def short_dirname(trial):
        return "trial_" + str(trial.trial_id)

    # send values into the progress reporter in the output 
    result = tune.run(
        partial(train, max_epochs=max_num_epochs),
        resources_per_trial=resources_per_trial,
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        trial_dirname_creator=short_dirname
    )

    # get best trial 
    best_trial = result.get_best_trial("mape", "min", "last")
    print(f"Best trial config: {best_trial.config}")
    print(f"Best trial final mape: {best_trial.last_result['mape']}")


    # Define the best trained model with the same hyper parameters decided in the config space 
    best_trained_model = MLPRegressor(
        l1=best_trial.config["l1"],
        l2=best_trial.config["l2"],
        l3=best_trial.config["l3"],
        activation=best_trial.config["activation"],
        dropout_prob=best_trial.config["dropout_prob"],
        )

    # Device selection 
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    if torch.cuda.device_count() > 1:
        best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)


    # Load the best checkpoint
    best_checkpoint = result.get_best_checkpoint(best_trial, metric="mape", mode="min")
    if best_checkpoint:
        with best_checkpoint.as_directory() as checkpoint_dir:
            checkpoint_path = Path(checkpoint_dir) / "data.pkl"
            with open(checkpoint_path, "rb") as fp:
                checkpoint_state = pickle.load(fp)
            best_trained_model.load_state_dict(checkpoint_state["model_state_dict"])
        print(f"Best checkpoint loaded from: {best_checkpoint}")
    else:
        print("No checkpoint found for the best trial.")

    # Save the best trained model
    torch.save(best_trained_model,"BenchTrain.pickle")


In [10]:
# ray init function
if not ray.is_initialized():
        ray.init()
# main function call
if __name__ == "__main__":
    # check for ray init
    if not ray.is_initialized():
        ray.init()

    try:
        main()
    # catch the error if the file is not found
    except FileNotFoundError as e:
        print(f"FileNotFoundError: {e}")

2025-03-12 16:26:11,501	INFO worker.py:1816 -- Started a local Ray instance.
2025-03-12 16:26:26,566	INFO tune.py:616 -- [output] This uses the legacy output and progress reporter, as Jupyter notebooks are not supported by the new engine, yet. For more information, please see https://github.com/ray-project/ray/issues/36949


0,1
Current time:,2025-03-12 17:27:52
Running for:,01:01:26.23
Memory:,26.9/127.8 GiB

Trial name,status,loc,activation,alpha,batch_size,dropout_prob,l1,l2,l3,learning_rate_init,max_iter,momentum,optimiser,tol,validation_fraction,iter,total time (s),loss,mape
train_bee0a_00000,TERMINATED,127.0.0.1:18384,RReLU,0.000476787,256,0.230825,2,1024,1,0.0202865,4000,0.580869,SGD,0.00011921,0.191677,1000,219.571,0.859657,3.81555
train_bee0a_00001,TERMINATED,127.0.0.1:29596,Hardtanh,6.44543e-05,8,0.215015,128,16,1,1.84527e-05,15000,0.679625,SGD,0.000625081,0.172068,1000,2985.07,0.856742,4.14392
train_bee0a_00002,TERMINATED,127.0.0.1:3836,Hardtanh,0.00946217,128,0.158995,32,8,512,5.81353e-05,20000,0.818895,AdamW,0.000146058,0.252481,10,4.32235,0.828817,3.67107
train_bee0a_00003,TERMINATED,127.0.0.1:37408,identity,0.00021863,32,0.229722,256,8,1024,2.30286e-05,20000,0.644548,NAdam,0.000196257,0.263387,10,9.753,0.834694,4.21109
train_bee0a_00004,TERMINATED,127.0.0.1:44188,LeakyReLU,5.79988e-05,8,0.227304,2048,256,256,0.0198996,4000,0.623151,RAdam,0.000329838,0.200933,10,35.9598,1.69265,9.78976
train_bee0a_00005,TERMINATED,127.0.0.1:30156,RReLU,0.000706371,128,0.268343,4,2,8,1.12611e-05,10000,0.958611,AdamW,0.00024023,0.151136,1000,307.924,0.851704,4.32714
train_bee0a_00006,TERMINATED,127.0.0.1:5316,Hardtanh,0.0032281,32,0.272121,1024,512,64,0.0252332,10000,0.849442,Adam,0.000156901,0.114805,10,11.1234,0.943626,6.05975
train_bee0a_00007,TERMINATED,127.0.0.1:16796,tanh,0.000322231,32,0.11229,128,256,64,1.20142e-05,15000,0.797851,AdamW,0.000259223,0.19856,10,10.2368,0.825949,4.66842
train_bee0a_00008,TERMINATED,127.0.0.1:28728,identity,0.00163248,32,0.244077,16,64,1,0.00131645,10000,0.79259,SGD,0.000534159,0.161611,20,17.0681,0.864413,3.20901
train_bee0a_00009,TERMINATED,127.0.0.1:29004,Hardtanh,0.000108889,256,0.213869,2048,1,512,0.000955787,4000,0.765874,AdamW,0.00014733,0.287208,20,5.37937,0.899102,4.04975


Trial name,loss,mape,should_checkpoint
train_bee0a_00000,0.859657,3.81555,True
train_bee0a_00001,0.856742,4.14392,True
train_bee0a_00002,0.828817,3.67107,True
train_bee0a_00003,0.834694,4.21109,True
train_bee0a_00004,1.69265,9.78976,True
train_bee0a_00005,0.851704,4.32714,True
train_bee0a_00006,0.943626,6.05975,True
train_bee0a_00007,0.825949,4.66842,True
train_bee0a_00008,0.864413,3.20901,True
train_bee0a_00009,0.899102,4.04975,True


[36m(func pid=18384)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/2450027/ray_results/train_2025-03-12_16-26-26/trial_bee0a_00000/checkpoint_000000)
[36m(func pid=18384)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/2450027/ray_results/train_2025-03-12_16-26-26/trial_bee0a_00000/checkpoint_000001)
[36m(func pid=18384)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/2450027/ray_results/train_2025-03-12_16-26-26/trial_bee0a_00000/checkpoint_000002)
[36m(func pid=18384)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/2450027/ray_results/train_2025-03-12_16-26-26/trial_bee0a_00000/checkpoint_000003)
[36m(func pid=18384)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=C:/Users/2450027/ray_results/train_2025-03-12_16-26-26/trial_bee0a_00000/checkpoint_000004)
[36m(func pid=18384)[0m Checkpoint successfully created at

Best trial config: {'activation': 'identity', 'alpha': 0.001632478152832471, 'batch_size': 32, 'l1': 16, 'l2': 64, 'l3': 1, 'learning_rate_init': 0.0013164538084653437, 'max_iter': 10000, 'tol': 0.0005341589437214072, 'momentum': 0.7925901612309569, 'validation_fraction': 0.16161121463818579, 'dropout_prob': 0.24407722437624688, 'optimiser': 'SGD'}
Best trial final mape: 3.209007792016293
Best checkpoint loaded from: Checkpoint(filesystem=local, path=C:/Users/2450027/ray_results/train_2025-03-12_16-26-26/trial_bee0a_00008/checkpoint_000001)
