In [1]:
#Load the packages
#import torch
#import torch.nn as nn
from lightning.pytorch import Trainer #https://lightning.ai/docs/pytorch/stable/common/trainer.html
from lightning.pytorch.callbacks import ModelCheckpoint, EarlyStopping #3
from lightning.pytorch.loggers import TensorBoardLogger #3
from maldi_zsl_edit.data import MALDITOFDataModule #1
from maldi_zsl_edit.models import ZSLClassifier #2
#import h5py
#import numpy as np

In [2]:
# Candidate values for a later hyperparameter search (reference only).
# Nothing in the visible cells iterates over these tuples; `dim_emb` is
# rebound to a single integer before the model is built.
batch_size = (16, 32, 64)
dim_emb = (512, 788, 1014)
lr = (1e-4, 1e-6)
dropout = (0.2, 0.5)
k_n = (3, 5, 7)      # presumably kernel sizes -- TODO confirm
c_n = (0, 64, 128)   # presumably channel / hidden sizes -- TODO confirm

In [2]:
# Build the project-specific Lightning data module from the binned spectra file.
dm = MALDITOFDataModule(
    "../Data/zsl_binned_new.h5t",  # the older file has problems with its split
    batch_size = 16,    # important hyperparameter
    split_index = 0,    # independent train-val-test split, numbered 0-9
    zsl_mode = True,    # True: ZSL, False: multi-class CLF
    in_memory = True,   # can stay on as long as memory is not a problem
    n_workers = 2,      # can stay as is unless CPU limited
)

# Run the data module's setup with stage None.
dm.setup(None)
# Disabled quick look at one training batch:
#batch = next(iter(dm.train_dataloader()))
#batch.keys()

## Training

In [4]:
# Author's note: the batch should expose a "seq_ohe" entry; the models file is
# expected to use it in place of batch["seq"].
# Class counts handed to the classifier (the recorded metric outputs report
# 160 strain labels for validation and 463 for train).
n_species = 160  # alternative left by the author: batch['strain'].shape[0]
t_species = 463
# Author's note: setting n_species correctly requires fixing the sequences seen per batch.
dim_emb = 520  # output width shared by both embedding branches below
model = ZSLClassifier(
    mlp_kwargs = dict(  # parameters used to build the MLP (spectrum) branch
        n_inputs = 6000,            # bins of the spectra
        emb_dim = dim_emb,          # output size of the branch
        layer_dims = [512, 256],
        layer_or_batchnorm = "layer",
        dropout = 0.2,
    ),
    cnn_kwargs = dict(  # parameters used to build the CNN (sequence) branch
        vocab_size = 6,             # the printed seq_ohe batch has 6 channels; the author's note listed 5 symbols (A,T,C,G,-) -- TODO confirm
        emb_dim = dim_emb,          # output size of the branch
        conv_sizes = [64, 128],     # out channels of the convolutions (alternative tried: [32, 64, 128]); in nlp mode the first is an embedding dimension
        hidden_sizes = [0],         # MLP alternative: [512, 256]; [0] goes straight from the convolutions to the embedding layer
        # Author's notes:
        #  - classification models apply the convolutions first and then an MLP; consider adding the MLP here too
        #  - the first hidden size is the embedding dim of the sequence language processing and needs tuning
        #  - the last one is the embedding dim of the shared space / score function
        blocks_per_stage = 2,       # residual blocks applied before the pooling
        kernel_size = 7,
        # Open questions from the author: stride? max / average / no pooling?
        dropout = 0.2,
        nlp = False,                # move directly to the branch
    ),
    n_classes = n_species,
    t_classes = t_species,
    lr = 1e-4,               # important to tune
    weight_decay = 0,        # can be kept constant
    lr_decay_factor = 1.00,  # can be kept constant
    warmup_steps = 250,      # can be kept constant
    #nlp = False #Try
)


In [5]:
# Checkpointing, early stopping and TensorBoard logging for this run
from datetime import datetime

# Timestamp used as the logger version (and reused in the saved model's file name)
timenow = datetime.now()
strtime = timenow.strftime("%Y-%m-%d_%H-%M-%S")

# Both callbacks watch "val_acc" and treat higher as better
val_ckpt = ModelCheckpoint(monitor="val_acc", mode="max")
early_stop = EarlyStopping(monitor="val_acc", patience=20, mode="max")
callbacks = [val_ckpt, early_stop]

# "../logs_folder" is the main folder; `name` identifies this family of runs
logger = TensorBoardLogger("../logs_folder", name="zsl_train_try2", version=strtime)

# Training specification (epochs and devices; see the Trainer documentation)
trainer = Trainer(
    accelerator='gpu',
    devices=[0],
    strategy='auto',
    max_epochs = 100,
    logger=logger,
    callbacks=callbacks,
)



GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [6]:
#Load from check point
# Optional: resume from a Lightning checkpoint instead of training from scratch.
sure = False  # flip to True to load the checkpoint below into `model`
if sure:
    # Local import: the top-of-notebook `import torch` is commented out, so on a
    # fresh kernel `torch` would otherwise be undefined when this branch runs.
    import torch

    # map_location='cpu' keeps the load independent of the device the checkpoint
    # was written from; load_state_dict copies the values into the model's own tensors.
    checkpoint = torch.load(
        '../logs_folder/zsl_train_try2/2024-07-22_06-25-16/checkpoints/epoch=56-step=60078.ckpt',
        map_location='cpu',
    )

    for name, param in checkpoint['state_dict'].items():
        print(f"Key: {name}, Shape: {param.shape}")
    for name, param in model.state_dict().items():
        print(f"Key: {name}, Shape: {param.shape}")

    # Report the key comparison explicitly; a bare `==` expression inside an
    # `if` block is evaluated and then silently discarded.
    ckpt_keys = set(checkpoint['state_dict'].keys())
    model_keys = set(model.state_dict().keys())
    print(f"State-dict keys match: {ckpt_keys == model_keys}")
    if ckpt_keys != model_keys:
        print(f"Only in checkpoint: {sorted(ckpt_keys - model_keys)}")
        print(f"Only in model: {sorted(model_keys - ckpt_keys)}")

    model.load_state_dict(checkpoint['state_dict'])


In [7]:
# Start training.
# Author's notes:
#  - normally `dm` alone could be passed, but the loaders are given explicitly
#    here because the dimensions differ between train and val;
#  - the model's training-step method decides which batch entries act as
#    inputs and which as targets.
trainer.fit(
    model,
    train_dataloaders=dm.train_dataloader(),
    val_dataloaders=dm.val_dataloader(),
)

    Found GPU1 NVIDIA Tesla K40c which is of cuda capability 3.5.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability supported by this library is 3.7.
    
2024-07-27 18:07:12.918863: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name              | Type               | Params
---------------------------------------------------------
0 | spectrum_embedder | MLPEmbedding       | 3.6 M 
1 | seq_embedder      | CNNEmbedding       | 372 K 
2 | accuracy          | MulticlassAccuracy | 0     
3 | accuracy2         | MulticlassAccuracy | 0     
4 | top5_accuracy     | MulticlassAccuracy | 0     
---------------------------------------------------------
4.0 M     Train

Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:00<00:00,  2.02it/s]
Epoch 0 - Train loss: NA, Train accu: 0.0
Epoch 0 - Val loss: 9.7366361618042, Val accu: 0.0

                                                                           

  rank_zero_warn(


Epoch 0: 100%|██████████| 1054/1054 [10:21<00:00,  1.69it/s, v_num=7-12]
Epoch 0 - Train loss: 7.726699352264404, Train accu: 0.002490512328222394
Epoch 0 - Val loss: 5.079351425170898, Val accu: 0.005376344081014395

Epoch 1: 100%|██████████| 1054/1054 [10:26<00:00,  1.68it/s, v_num=7-12]
Epoch 1 - Train loss: 6.337392330169678, Train accu: 0.0024312143214046955
Epoch 1 - Val loss: 5.073244571685791, Val accu: 0.005376344081014395

Epoch 2: 100%|██████████| 1054/1054 [10:27<00:00,  1.68it/s, v_num=7-12]
Epoch 2 - Train loss: 6.252948760986328, Train accu: 0.0020754269789904356
Epoch 2 - Val loss: 5.087851047515869, Val accu: 0.009408602491021156

Epoch 3: 100%|██████████| 1054/1054 [10:30<00:00,  1.67it/s, v_num=7-12]
Epoch 3 - Train loss: 6.207500457763672, Train accu: 0.001541745732538402
Epoch 3 - Val loss: 5.071695327758789, Val accu: 0.005376344081014395

Epoch 4: 100%|██████████| 1054/1054 [10:29<00:00,  1.67it/s, v_num=7-12]
Epoch 4 - Train loss: 6.1932501792907715, Train accu:

In [8]:
# TODO(review): a disabled timing snippet used to live here; it subtracted the
# formatted string `strtime` from another formatted string. Measuring the
# training time needs datetime objects (e.g. datetime.now() - timenow).

# Also persist the model object at the end of the training
from torch import save as torch_save

sure = True  # set to False to skip writing the file
if sure:
    save_path = f'../SavedModels/ZSLmodel{strtime}.pth'
    torch_save(model, save_path)
    print(f"Saved as ZSLmodel{strtime}.pth")


Saved as ZSLmodel2024-07-27_18-07-12.pth


# Test model

In [1]:
# Restore a previously saved model object.
# NOTE(review): the file was written with torch.save(model, ...), i.e. it is a
# pickled object -- only load files you trust. The path below is a fixed,
# earlier run, not necessarily the model saved in this session.
from torch import load as torch_load

sure = True  # set to False to keep the `model` already in memory
if sure:
    saved_model_path = '../SavedModels/ZSLmodel2024-07-22_06-25-16.pth'
    model = torch_load(saved_model_path)

## Get predictions

In [7]:
# Peek at one validation batch and print the shapes of the spectra and of the
# one-hot encoded sequences.
# Author's note: the batch should be 623 (463 of training and 160 of val; the
# remaining 165 are in test).
minibatch = next(iter(dm.val_dataloader()))
for field in ("intensity", "seq_ohe"):
    print(minibatch[field].shape)

torch.Size([16, 6000])
torch.Size([160, 6, 2236])


In [2]:
from maldi_zsl_edit.utils import ZSL_levels_metrics

# Multi-level evaluation with the helper's default split (the recorded output
# reports "Working with validation set").
data_path = "../Data/zsl_binned_new.h5t"
#data_set = dm.val_dataloader()
levels = ["Family", "Genus", "Species", "Strain"]
# Author's note: consider adding a train/val mode and a larger batch size inside the helper.
a, b = ZSL_levels_metrics(
    data_path,
    model,
    levels,
)

--- Getting predictions ---
Working with validation set



    Found GPU1 NVIDIA Tesla K40c which is of cuda capability 3.5.
    PyTorch no longer supports this GPU because it is too old.
    The minimum cuda capability supported by this library is 3.7.
    



--- Multi level evaluation ---
For Family there are 33 different labels
For Genus there are 52 different labels
For Species there are 152 different labels
For Strain there are 160 different labels

--- Calculating Accuracy ---
For the level Family the accu score is: 0.6352465748786926
For the level Genus the accu score is: 0.5973742008209229
For the level Species the accu score is: 0.3475845754146576
For the level Strain the accu score is: 0.3218313455581665

--- Calculating F1 scores ---
For the level Family the F1 score is: 0.3254104256629944
For the level Genus the F1 score is: 0.3524334728717804
For the level Species the F1 score is: 0.26579439640045166
For the level Strain the F1 score is: 0.24963624775409698


In [3]:
# Same multi-level evaluation, this time asking the helper for the "Train" split
c, d = ZSL_levels_metrics(
    data_path,
    model,
    levels,
    "Train",
)

--- Getting predictions ---
Working with train set


--- Multi level evaluation ---
For Family there are 77 different labels
For Genus there are 141 different labels
For Species there are 451 different labels
For Strain there are 463 different labels

--- Calculating Accuracy ---
For the level Family the accu score is: 0.9964399933815002
For the level Genus the accu score is: 0.9960840344429016
For the level Species the accu score is: 0.9781060814857483
For the level Strain the accu score is: 0.9660021066665649

--- Calculating F1 scores ---
For the level Family the F1 score is: 0.9900447726249695
For the level Genus the F1 score is: 0.9922187328338623
For the level Species the F1 score is: 0.978141188621521
For the level Strain the F1 score is: 0.9641870260238647


In [11]:
# Open the data set file read-only to inspect its contents
import h5py

# The old version of the file has problems with the split.
# NOTE(review): the handle is not closed in the visible cells.
h5spectra = h5py.File("../Data/zsl_binned_new.h5t", mode="r")
#h5spectra.visititems(print) #See the data
# Author's note: there are train: 463, val: 160 and test: 165
#torch.cuda.empty_cache()

In [12]:
#Separate the predictions to look their individual accuracies
import numpy as np

# Splits of interest. For every entry of h5spectra["0"]["split_0"] whose label
# is one of them, record the position of the first True value in the matching
# row of h5spectra["central"].
study = [b'train', b'val_geni', b'val_spec', b'val_strain']
ev_species = {split_name: [] for split_name in study}

# enumerate() replaces the hand-maintained counter, and descriptive temporaries
# replace `a` / `b`, which silently overwrote the metric results bound to
# `a, b` by the validation-metrics cell.
for row_idx, label in enumerate(h5spectra["0"]["split_0"]):
    if label in study:
        central_row = h5spectra["central"][row_idx]
        # Elementwise comparison on purpose (array == True), not an identity test
        first_true = np.where(central_row == True)[0][0]
        ev_species[label].append(first_true)

In [13]:
#Manual calculation of the predictions
import torch.nn as nn
import torch

# Switch to inference mode before scoring: torch.no_grad() only disables
# gradient tracking, it does not turn off dropout (both branches are built
# with dropout 0.2), so without eval() the scores change from run to run.
# The model is left in eval mode afterwards.
model.eval()

batch_scores = []  # one score tensor per validation batch
y_real = []
with torch.no_grad():
    for minibatch in dm.val_dataloader():  # swap the loader here to score another split
        y_hat = model(minibatch)
        # .cpu() is a no-op for CPU tensors and keeps the result independent of
        # the device the model lives on.
        batch_scores.append(y_hat.cpu())
        y_real += list(minibatch['strain'])

# Concatenate once instead of growing the tensor inside the loop; the number of
# columns now comes from the model output instead of a hard-coded 160 (the
# fallback only applies to an empty loader).
y_pred = torch.cat(batch_scores, dim=0) if batch_scores else torch.empty((0, 160))
print(y_pred.shape) #(number of spectra, total possible species)
y_pred

torch.Size([5941, 160])


tensor([[-26.6596, -24.7468, -27.5153,  ..., -40.4013, -24.6716, -27.3861],
        [-44.8781, -36.4867, -35.9950,  ..., -56.0938, -38.7423, -45.9434],
        [-12.2353, -15.5614, -13.2962,  ..., -22.6843, -12.1105, -21.2666],
        ...,
        [-28.7306, -31.2096, -27.5706,  ..., -46.6892, -49.7223, -28.9330],
        [-52.5257, -53.3985, -47.9212,  ..., -75.5992, -76.6577, -52.0940],
        [-34.3973, -34.1811, -26.4999,  ..., -49.5889, -46.2922, -28.2480]])

## Labels for Multilevel evaluation

In [14]:
# Predicted class = column holding the highest score in each row of y_pred
pred_ind = y_pred.argmax(dim=1)
# The true labels collected while iterating over the loader
real_ind = y_real
# Names of the taxonomic levels evaluated below, and how many there are
levels = ["Family", "Genus", "Species", "Strain"]
granularity_lvl = len(levels)

In [15]:
# Sequence names, later split on ";" and indexed by the real / predicted class indices.
# `minibatch` is whatever batch was bound last (the final batch of the
# prediction loop when the cells are run in order).
# NOTE(review): this assumes "seq_names" is identical in every batch and is
# ordered like the columns of y_pred -- TODO confirm.
filos = minibatch["seq_names"]

In [16]:
# Turn every real / predicted index into its list of level labels by splitting
# the matching entry of `filos` on ";" (consider how the data is encoded:
# genus, species, strain).
ml_real = []
ml_pred = []
for real_idx, pred_idx in zip(real_ind, pred_ind):
    ml_real.append(filos[real_idx].split(";"))  # labels of the true class
    ml_pred.append(filos[pred_idx].split(";"))  # labels of the predicted class

In [17]:
# Reshape so that each row holds one level and each column one sample.
# NOTE: ml_real / ml_pred are rebound to the transposed arrays, so running this
# cell a second time transposes them back.
import numpy as np
ml_real = np.transpose(np.array(ml_real))
ml_pred = np.transpose(np.array(ml_pred))
# Plain nested lists are easier to iterate over
ml_reals = ml_real.tolist()
ml_preds = ml_pred.tolist()

In [18]:
# Every label that can occur at each level: split each entry of `filos` on ";"
# and transpose, so that row k lists the level-k label of every entry.
ml_level = np.array([seq_name.split(";") for seq_name in filos]).T
ml_levels = ml_level.tolist()

In [19]:
# Number of distinct labels available at each level
for level_idx in range(granularity_lvl):
    n = len(set(ml_levels[level_idx]))
    print(f"For {levels[level_idx]} there are {n} different labels")

For Family there are 33 different labels
For Genus there are 52 different labels
For Species there are 152 different labels
For Strain there are 160 different labels


## Accuracy evaluation

In [20]:
#from sklearn.metrics import accuracy_score #There is also a torch version, consider it
from sklearn.preprocessing import LabelEncoder
from torchmetrics import Accuracy

#Create a accuracy evaluator
def accu_score(y_true, y_pred, level_lab):
    """Multiclass accuracy of `y_pred` against `y_true` for one taxonomic level.

    Args:
        y_true: true labels, one per sample.
        y_pred: predicted labels, one per sample.
        level_lab: all labels that exist at this level. The encoder is fitted
            on this collection only, so `y_true` and `y_pred` are expected to
            contain nothing outside it.

    Returns:
        The value produced by torchmetrics' multiclass `Accuracy` for the
        encoded predictions and targets.
    """
    # Map labels to integer codes with an encoder fitted on the full label set
    encoder = LabelEncoder().fit(level_lab)
    target = torch.tensor(encoder.transform(y_true))
    preds = torch.tensor(encoder.transform(y_pred))

    # torchmetrics is used here; sklearn's accuracy_score was the alternative considered
    metric = Accuracy(task="multiclass", num_classes=len(set(level_lab)))
    return metric(preds, target)

In [21]:
# Accuracy for each level of complexity, in the order given by `levels`
accu_levels = [
    accu_score(ml_reals[lvl], ml_preds[lvl], ml_levels[lvl])
    for lvl in range(granularity_lvl)
]

In [22]:
# Show the accuracy obtained at every level
# (levels and accu_levels both hold granularity_lvl entries)
for level_name, level_accu in zip(levels, accu_levels):
    print(f"For the level {level_name} the accu score is: {level_accu}")

For the level Family the accu score is: 0.6105032563209534
For the level Genus the accu score is: 0.5556303858757019
For the level Species the accu score is: 0.2676317095756531
For the level Strain the accu score is: 0.2420467883348465


## F1 evaluation

In [23]:
#from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
from torchmetrics import F1Score

#Create an F1 evaluator
def f1_macro_score(y_true, y_pred, level_lab):
    """Macro-averaged F1 of `y_pred` against `y_true` for one taxonomic level.

    Macro averaging is used because the micro average is basically accuracy.

    Args:
        y_true: true labels, one per sample.
        y_pred: predicted labels, one per sample.
        level_lab: all labels that exist at this level. The encoder is fitted
            on this collection only, so `y_true` and `y_pred` are expected to
            contain nothing outside it.

    Returns:
        The value produced by torchmetrics' multiclass `F1Score` with
        average='macro' for the encoded predictions and targets.
    """
    # Map labels to integer codes with an encoder fitted on the full label set
    encoder = LabelEncoder().fit(level_lab)
    target = torch.tensor(encoder.transform(y_true))
    preds = torch.tensor(encoder.transform(y_pred))

    # torchmetrics is used here; averaging sklearn's per-class f1_score was the alternative considered
    metric = F1Score(task="multiclass", num_classes=len(set(level_lab)), average='macro')
    return metric(preds, target)

In [24]:
# Macro F1 for each level of complexity, in the order given by `levels`
F1_levels = [
    f1_macro_score(ml_reals[lvl], ml_preds[lvl], ml_levels[lvl])
    for lvl in range(granularity_lvl)
]

In [25]:
# Show the macro F1 obtained at every level
# (levels and F1_levels both hold granularity_lvl entries).
# Author's note: the predictions are not the same as the helper's output; maybe F1 is not used there.
for level_name, level_f1 in zip(levels, F1_levels):
    print(f"For the level {level_name} the F1 score is: {level_f1}")

For the level Family the F1 score is: 0.2802780866622925
For the level Genus the F1 score is: 0.2948206663131714
For the level Species the F1 score is: 0.19715124368667603
For the level Strain the F1 score is: 0.1872057020664215
