In [1]:
#Load the packages
import torch
import torch.nn as nn
from lightning.pytorch import Trainer #https://lightning.ai/docs/pytorch/stable/common/trainer.html
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping
from lightning.pytorch.loggers import TensorBoardLogger
from maldi_zsl_edit.data import MALDITOFDataModule
from maldi_zsl_edit.models import ZSLClassifier
import h5py
import numpy as np

In [2]:
torch.cuda.empty_cache() 

h5spectra = h5py.File("../Data/zsl_binned_new.h5t", "r") # The old version has problems on the split
h5spectra.visititems(print)
#There are train: 463, val :160 and test: 165 

0 <HDF5 group "/0" (11 members)>
0/intensity <HDF5 dataset "intensity": shape (29105, 6000), type "<f4">
0/split_0 <HDF5 dataset "split_0": shape (29105,), type "|S11">
0/split_1 <HDF5 dataset "split_1": shape (29105,), type "|S11">
0/split_2 <HDF5 dataset "split_2": shape (29105,), type "|S11">
0/split_3 <HDF5 dataset "split_3": shape (29105,), type "|S11">
0/split_4 <HDF5 dataset "split_4": shape (29105,), type "|S11">
0/split_5 <HDF5 dataset "split_5": shape (29105,), type "|S11">
0/split_6 <HDF5 dataset "split_6": shape (29105,), type "|S11">
0/split_7 <HDF5 dataset "split_7": shape (29105,), type "|S11">
0/split_8 <HDF5 dataset "split_8": shape (29105,), type "|S11">
0/split_9 <HDF5 dataset "split_9": shape (29105,), type "|S11">
1 <HDF5 group "/1" (4 members)>
1/strain_names <HDF5 dataset "strain_names": shape (788,), type "|S124">
1/strain_seq <HDF5 dataset "strain_seq": shape (788,), type "|S1676">
1/strain_seq_aligned <HDF5 dataset "strain_seq_aligned": shape (788, 2236), type

In [3]:
#What to tune
batch_size = 16
dim_emb = 788
lr=1e-4

In [4]:
#Load the data set
dm = MALDITOFDataModule( #Personalized lightning data modules
    "../Data/zsl_binned_new.h5t", #The old has problems on split
    zsl_mode = True, # False: multi-class CLF, True: ZSL
    split_index = 0, # independent train-val-test split numbered 0-9
    batch_size = 16, # important hyperparameter
    n_workers = 2, # you can leave this always if you are not CPU limited
    in_memory = True, # you can leave this always if memory is no problem
    )

dm.setup(None)
batch = next(iter(dm.train_dataloader()))
batch.keys()

    Found GPU2 NVIDIA Tesla K40c which is of cuda capability 3.5.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability supported by this library is 3.7.
    


dict_keys(['intensity', 'mz', 'group', 'strain', 'seq', 'seq_names', 'seq_ohe'])

## Training

In [5]:
#Now there should be a batch instance ["seq_ohe]", replace the batch["seq"] with it in the models file
n_species = 788 #batch['strain'].shape[0] #Number the seq considered for the train, #The batch should be 623 (463 of training and 160 of val, the rest 165 are on test)
#To add the correct number of n_species you need to correct the sequences seen per batch
dim_emb = 788
model = ZSLClassifier(
    mlp_kwargs = { #specify the parameters to buld the MLP ()
        'n_inputs' : 6000, #Bins of the spectra
        'emb_dim' : dim_emb, #This is the output of the branch
        'layer_dims': [512, 256],
        'layer_or_batchnorm' : "layer",
        'dropout' : 0.2,
    },
    cnn_kwargs= { #specify the parameters to buld the CNN ()
        'vocab_size' : 6, #Number of words, in this case is 5 as (A,T,C,G,-)
        'emb_dim' : dim_emb, #This is the output of the branch
        'conv_sizes' : [32, 64, 128], #[32, 64, 128] Out chanels of the convolutions #On the nlp mode the first is an embeding dimension
        'hidden_sizes' :[512, 256], #MLP: [512, 256]. If false then goes directly from conv to embeding layer
        #IMPORTANT: The models for classification have first the convolution and then a MLP, consider to also add the MLP in the model
        #Note: The first hidden state is the embedding dim of the seq language processing and need to be optimized
        #Note2: The last is the embedding dim for the shared space and score function
        'blocks_per_stage' : 2, #How many residual blocks are applied before the pooling
        'kernel_size' : 3,
        #Stride?
        'dropout' : 0.2,
        'nlp' : False #Move directly to the branch
    },
    n_classes = n_species,
    lr=1e-4, # important to tune
    weight_decay=0, # this you can keep constant
    lr_decay_factor=1.00, # this you can keep constant
    warmup_steps=250, # this you can keep constant
    #nlp = False #Try
)

In [6]:
#Save and monitor training with tensor board
val_ckpt = ModelCheckpoint(monitor="val_acc", mode="max")
callbacks = [val_ckpt, EarlyStopping(monitor="val_acc", patience=10, mode="max")]
logger = TensorBoardLogger("../logs_folder", name="zsl_train_try") # Ctrl+Shift+P # Main folder where the training is saved and the name for the training

#Training specification
trainer = Trainer(
    max_epochs = 100, 
    accelerator='gpu', 
    strategy='auto',
    callbacks=callbacks,
    logger=logger,
    devices=[0]) #You can define epochs and training devices (look on documentation)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [7]:
#Start training
print("\n--- Start training ---\n")
trainer.fit(model, dm.train_dataloader(), dm.val_dataloader()) #Important: normally you can use only dm, but here we specify as the dim of a are different for train and val 
#Note: The model object specify what is considered an input values and what is considered an input/output value during the training on the training step method


--- Start training ---



2024-07-12 19:16:01.096649: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2]

  | Name              | Type               | Params
---------------------------------------------------------
0 | spectrum_embedder | MLPEmbedding       | 3.6 M 
1 | seq_embedder      | CNNEmbedding       | 484 K 
2 | accuracy          | MulticlassAccuracy | 0     
3 | top5_accuracy     | MulticlassAccuracy | 0     
---------------------------------------------------------
4.1 M     Trainable params
0         Non-trainable params
4.1 M     Total params
16.381    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 0:   0%|          | 0/1054 [00:00<?, ?it/s] 

OutOfMemoryError: CUDA out of memory. Tried to allocate 506.00 MiB. GPU 0 has a total capacty of 10.91 GiB of which 374.25 MiB is free. Process 24414 has 985.00 MiB memory in use. Process 11756 has 859.00 MiB memory in use. Including non-PyTorch memory, this process has 8.74 GiB memory in use. Of the allocated memory 6.56 GiB is allocated by PyTorch, and 1.40 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## Save the model

In [None]:
#Save the model
from datetime import datetime
timenow = datetime.now()
strtime = timenow.strftime("%Y-%m-%d_%H-%M-%S")
sure = True
if sure:
    torch.save(model, f'../SavedModels/ZSLmodel{strtime}.pth')
    print(f"Saved as ZSLmodel{strtime}.pth")

Saved as ZSLmodel2024-07-12_18-23-11.pth


# Test model

In [None]:
#Load the model
sure = False
if sure:
    model = torch.load(f'SavedModels\ZSLmodel{strtime}.pth')

## Get predictions

In [None]:
minibatch = next(iter(dm.val_dataloader()))
print(minibatch["intensity"].shape)
print(minibatch["seq_ohe"].shape) #The batch should be 623 (463 of training and 160 of val, the rest 165 are on test)

torch.Size([16, 6000])
torch.Size([788, 6, 2236])


In [None]:
#Separate the predictions to look their individual accuracies
ev_species = {}
ev_species[b'train'] = []
ev_species[b'val_geni'] = []
ev_species[b'val_spec'] = []
ev_species[b'val_strain'] = []
study = [b'train',b'val_geni',b'val_spec',b'val_strain']
i = 0
for label in h5spectra["0"]["split_0"]:
    if label in study:
        a = h5spectra["central"][i]
        b = np.where(a == True)[0][0]
        ev_species[label].append(b)
    i+=1

In [None]:
#Manual calculation of the predictions
import torch.nn as nn
import torch
y_pred = torch.empty((0,788)) #the second is the number of species
y_real= []
with torch.no_grad():
    for minibatch in iter(dm.val_dataloader()): #On the split said if train, val, etc, 
        y_hat = model(minibatch)
        y_pred = torch.cat((y_pred,y_hat),dim=0)
        y_real+= list(minibatch['strain'])
print(y_pred.shape) #(batch size, total possible species)
y_pred

torch.Size([5941, 788])


tensor([[ -7.4667,  -4.9486,  -4.7093,  ...,   7.5747,  -5.6208,   0.3227],
        [-10.4188, -11.1706,  -7.2875,  ...,   6.1635,  -5.2368,  -3.0342],
        [ -7.1118,  -5.8475,  -0.2893,  ...,  11.3903,  -7.3323,  -3.5839],
        ...,
        [-21.2110, -20.2034, -23.1131,  ..., -46.0813, -40.3614, -34.0153],
        [-14.5026, -13.6092, -15.0301,  ..., -27.7574, -23.8724, -22.1448],
        [-22.3036, -12.1983, -23.7718,  ..., -38.6719, -36.9044, -31.5572]])

## Labels for Multilevel evaluation

In [None]:
pred_ind = torch.argmax(y_pred, axis=1)
real_ind = y_real
levels = ["Family", "Genus", "Species", "Strain"]
granularity_lvl = len(levels) 

In [None]:
filos = minibatch["seq_names"]

In [None]:
#Get the multilevel predictions, consider how the data is encoded (genus, species, strain)
ml_real = []
ml_pred = []
for i in range(len(y_real)):
  #for real:
  s_real = filos[real_ind[i]].split(";")
  ml_real.append(s_real)
  #for pred:
  s_pred = filos[pred_ind[i]].split(";")
  ml_pred.append(s_pred)

In [None]:
#Get them on the right format
import numpy as np
ml_real = np.array(ml_real).T
ml_pred = np.array(ml_pred).T
#List for better iteratation
ml_reals = ml_real.tolist()
ml_preds = ml_pred.tolist()

In [None]:
#Get all the possible multilevel labels
ml_level = []
for i in range(len(filos)):
  s_level = filos[i].split(";")
  ml_level.append(s_level)
ml_level = np.array(ml_level).T
ml_levels = ml_level.tolist()

In [None]:
#Total number of labels
for i in range(granularity_lvl):
    n = len(list(set(ml_levels[i])))
    print(f"For {levels[i]} there are {n} different labels")

For Family there are 77 different labels
For Genus there are 162 different labels
For Species there are 679 different labels
For Strain there are 788 different labels


## Accuracy evaluation

In [None]:
from sklearn.metrics import accuracy_score #There is also a torch version, consider it
from sklearn.preprocessing import LabelEncoder
from torchmetrics import Accuracy

#Create a accuracy evaluator
def accu_score(y_true, y_pred, level_lab):
    label_encoder = LabelEncoder()
    y_true_encoded = label_encoder.fit_transform(level_lab)
    y_true_encoded = label_encoder.transform(y_true)
    y_pred_encoded = label_encoder.transform(y_pred)

    #Using sklearn
    #accu = accuracy_score(y_true_encoded, y_pred_encoded, normalize=True) #The normalize True = number of correct predictions, False = fraction of correct predictions
    
    #Using torch
    accu = Accuracy(task="multiclass", num_classes=len(set(level_lab))) 
    accu = accu(torch.tensor(y_pred_encoded), torch.tensor(y_true_encoded))
    
    return accu

In [None]:
# run accu for each level of complexity
accu_levels = []
for level in range(granularity_lvl ):
  accu_levels.append(accu_score(ml_reals[level], ml_preds[level], ml_levels[level]))

In [None]:
# see the results
for i in range(granularity_lvl ):
  print(f"For the level {levels[i]} the accu score is: {accu_levels[i]}") 

For the level Family the accu score is: 0.6377714276313782
For the level Genus the accu score is: 0.5886214375495911
For the level Species the accu score is: 0.19104528427124023
For the level Strain the accu score is: 0.04022891819477081


## F1 evaluation

In [None]:
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from torchmetrics import F1Score

#Create an F1 evaluator
def f1_macro_score(y_true, y_pred, level_lab): #micro average is basically accuracy
    label_encoder = LabelEncoder()
    y_true_encoded = label_encoder.fit_transform(level_lab)
    y_true_encoded = label_encoder.transform(y_true)
    y_pred_encoded = label_encoder.transform(y_pred)

    #Using sklearn
    #f1_scores = f1_score(y_true_encoded, y_pred_encoded, average=None)
    #macro_f1 = sum(f1_scores) / len(f1_scores)

    #Using torch
    macro_f1 = F1Score(task="multiclass", num_classes=len(set(level_lab)), average='macro') 
    macro_f1 = macro_f1(torch.tensor(y_pred_encoded), torch.tensor(y_true_encoded))

    return macro_f1

In [None]:
# run f1_macro_score for each level of complexity
F1_levels = []
for level in range(granularity_lvl ):
  F1_levels.append(f1_macro_score(ml_reals[level], ml_preds[level], ml_levels[level]))

In [None]:
# see the results
for i in range(granularity_lvl ):
  print(f"For the level {levels[i]} the F1 score is: {F1_levels[i]}") #The predictions are no the same as the output, maybe F1 is not used there

For the level Family the F1 score is: 0.15699416399002075
For the level Genus the F1 score is: 0.1215033307671547
For the level Species the F1 score is: 0.048142191022634506
For the level Strain the F1 score is: 0.013832748867571354
