This file contains a basic squeme for training and test of a Multilayer Perceptron (MLP) Neural Network (NN) using Matrix-assisted laser desorption/ionization–time of flight (MALDI-TOF) mass spectrometry (MS) Data. The code is adapted to work with PyTorch Lightning.

NOTE : The use of cuda depends on the GPU available, modern versions of cuda doesn't work with old GPUs and some old versions of cuda doesn't work with modern versions of pytorch and numpy. Check the dependencies before running.

In [1]:
from maldi_zsl.data import SpeciesClfDataModule
from lightning.pytorch import Trainer, seed_everything
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from lightning.pytorch.loggers import TensorBoardLogger
from maldi_zsl.models import SpeciesClassifier

dm = SpeciesClfDataModule( #Is use to load the data on h5 format (data stored on a "dictionary like" format to ease the access)
    "../Data/RKIbin.h5", # or "RKIbin.h5"
    batch_size=128, # important parameter to tune, is the number of instances that are going to be considered on each train iteration
    n_workers=2, # increase to use more CPU power
)

dm.setup(None)

model = SpeciesClassifier(
    mlp_kwargs={ #On this dictionary is the basic configuration of the NN architecture
        "n_inputs" : 6000, # keep constant
        "n_outputs": dm.n_species, #is different for DRIAMS and RKI
        "layer_dims": [512, 256], # decides how big the network is
        "layer_or_batchnorm": "layer",
        "dropout": 0.2,
    },
    lr=1e-4, # important to tune
    weight_decay=0, # this you can keep constant
    lr_decay_factor=1.00, # this you can keep constant
    warmup_steps=250, # this you can keep constant
)

In [12]:
import h5py
import torch
f = h5py.File("../Data/RKIbin.h5", "r")
f.visititems(print)

0 <HDF5 group "/0" (2 members)>
0/intensity <HDF5 dataset "intensity": shape (11055, 6000), type "<f8">
0/loc <HDF5 dataset "loc": shape (11055,), type "|S158">
central <HDF5 dataset "central": shape (11055,), type "<i8">
unstructured <HDF5 group "/unstructured" (3 members)>
unstructured/mz <HDF5 dataset "mz": shape (6000,), type "<f8">
unstructured/species_labels <HDF5 dataset "species_labels": shape (270,), type "|S41">
unstructured/split <HDF5 dataset "split": shape (11055,), type "|S5">


In [14]:
count = {}
for split in f["unstructured/split"]:
    if split in count: count[split]+=1
    else: count[split] = 1
count

{b'train': 8442, b'val': 1350, b'test': 1263}

In [3]:
c = 0
for minibatch in iter(dm.train_dataloader()):
    for i in minibatch:
        c+=1
print(c)



    Found GPU1 NVIDIA Tesla K40c which is of cuda capability 3.5.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability supported by this library is 3.7.
    


198


In [5]:
val_ckpt = ModelCheckpoint(monitor="val_acc", mode="max")
callbacks = [val_ckpt, EarlyStopping(monitor="val_acc", patience=10, mode="max")]
logger = TensorBoardLogger("../logs_folder", name="test_run") # Ctrl+Shift+P

trainer = Trainer( #lightning module for NN training
    accelerator="gpu",
    devices=[0],
    strategy="auto",
    max_epochs=100,
    callbacks=callbacks,
    logger=logger,
)

trainer.fit(model, dm.train_dataloader(), dm.val_dataloader())

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name              | Type               | Params
---------------------------------------------------------
0 | spectrum_embedder | MLP                | 3.3 M 
1 | accuracy          | MulticlassAccuracy | 0     
2 | top5_accuracy     | MulticlassAccuracy | 0     
---------------------------------------------------------
3.3 M     Trainable params
0         Non-trainable params
3.3 M     Total params
13.099    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Epoch 44: 100%|██████████| 66/66 [00:01<00:00, 51.20it/s, v_num=12]        


In [6]:
res = trainer.validate(model, dm.val_dataloader(), ckpt_path=val_ckpt.best_model_path)

Restoring states from the checkpoint path at ../logs_folder/test_run/version_12/checkpoints/epoch=34-step=2310.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]
Loaded model weights from the checkpoint at ../logs_folder/test_run/version_12/checkpoints/epoch=34-step=2310.ckpt


Validation DataLoader 0: 100%|██████████| 11/11 [00:00<00:00, 123.98it/s]


# HPTunning

In [7]:
# Layers, lr, dropout
import os
from torch import manual_seed as torch_manual_seed
from numpy.random import seed as np_random_seed
from torch.cuda import is_available as torch_cuda_is_available
from torch.cuda import manual_seed as torch_cuda_manual_seed
from torch.cuda import manual_seed_all as torch_cuda_manual_seed_all
import random

def set_seed(seed):
    seed_everything(seed, workers=True)
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np_random_seed(seed)
    torch_manual_seed(seed)
    if torch_cuda_is_available():
        torch_cuda_manual_seed(seed)
        torch_cuda_manual_seed_all(seed)

In [11]:
corpus = """
\\begin{table}
\caption{Results of the hyperparameter tunning considering the general accuracy.}
\centering
\small
\\begin{tabular}{l c c c}
\\toprule
\multicolumn{3}{c}{Parameter} & \multicolumn{1}{c}{Result} \\ \cmidrule(lr){1-3} \cmidrule(lr){4-4}
lr & dropout & mlp\_hid & Accuracy\\
\midrule"""

config = {
    "lr": [1e-5, 1e-4, 1e-3],
    "dropout" : [0.2,0.4,0.6],
    "mlp_hid" : [[256, 256],[512, 256],[512, 256, 256],[512, 512, 256]],
}

In [None]:
for lr in config["lr"]:
    for dropout in config["dropout"]:
        for mlp_hid in config["mlp_hid"]:
            model = SpeciesClassifier(
                mlp_kwargs={ #On this dictionary is the basic configuration of the NN architecture
                    "n_inputs" : 6000, # keep constant
                    "n_outputs": dm.n_species, #is different for DRIAMS and RKI
                    "layer_dims": mlp_hid, # decides how big the network is
                    "layer_or_batchnorm": "layer",
                    "dropout": dropout,
                },
                lr=lr, # important to tune
                weight_decay=0, # this you can keep constant
                lr_decay_factor=1.00, # this you can keep constant
                warmup_steps=250, # this you can keep constant
            )

            val_ckpt = ModelCheckpoint(monitor="val_acc", mode="max")
            callbacks = [val_ckpt, EarlyStopping(monitor="val_acc", patience=10, mode="max")]
            logger = TensorBoardLogger("../logs_folder", name="tune_run") # Ctrl+Shift+P
            trainer = Trainer( 
                accelerator="gpu",
                devices=[0],
                strategy="auto",
                max_epochs=100,
                callbacks=callbacks,
                logger=logger,
            )
            trainer.fit(model, dm.train_dataloader(), dm.val_dataloader())
            res = trainer.validate(model, dm.val_dataloader(), ckpt_path=val_ckpt.best_model_path)
            acc = res[0]["val_acc"]
            row=f"""
{lr} & {dropout} & {mlp_hid} & {round(acc, 4)} \\\\"""
            corpus+=row
tail = """
\\bottomrule
\end{tabular}
\caption{Hyperpameter tunning of the data}
\end{table}


"""
corpus+=tail

In [20]:
print(corpus)


\begin{table}
\caption{Results of the hyperparameter tunning considering the general accuracy.}
\centering
\small
\begin{tabular}{l c c c}
\toprule
\multicolumn{3}{c}{Parameter} & \multicolumn{1}{c}{Result} \ \cmidrule(lr){1-3} \cmidrule(lr){4-4}
lr & dropout & mlp\_hid & Accuracy\
\midrule
1e-05 & 0.2 & [256, 256] & 0.8696 \\
1e-05 & 0.2 & [512, 256] & 0.8607 \\
1e-05 & 0.2 & [512, 256, 256] & 0.8681 \\
1e-05 & 0.2 & [512, 512, 256] & 0.883 \\
1e-05 & 0.4 & [256, 256] & 0.8733 \\
1e-05 & 0.4 & [512, 256] & 0.8763 \\
1e-05 & 0.4 & [512, 256, 256] & 0.843 \\
1e-05 & 0.4 & [512, 512, 256] & 0.8622 \\
1e-05 & 0.6 & [256, 256] & 0.8156 \\
1e-05 & 0.6 & [512, 256] & 0.8511 \\
1e-05 & 0.6 & [512, 256, 256] & 0.0867 \\
1e-05 & 0.6 & [512, 512, 256] & 0.0615 \\
0.0001 & 0.2 & [256, 256] & 0.9022 \\
0.0001 & 0.2 & [512, 256] & 0.9052 \\
0.0001 & 0.2 & [512, 256, 256] & 0.9015 \\
0.0001 & 0.2 & [512, 512, 256] & 0.8807 \\
0.0001 & 0.4 & [256, 256] & 0.9 \\
0.0001 & 0.4 & [512, 256] & 0.8933 \\
