# Fine-Tuning ChemBERTa


https://github.com/seyonechithrananda/bert-loves-chemistry/blob/master/chemberta/examples/ZINC250K_Transfer_Learning_With_HuggingFace_tox21.ipynb


## Libraries


In [2]:
# !pip install transformers==4.45.2 # 4.46.0 causes problems
# !pip install accelerate -U
# !pip install -U datasets
# !pip install -U huggingface_hub

In [3]:
import os
import shutil
import time

import pickle

import ast
import numpy as np

import wandb

from sklearn.preprocessing import MinMaxScaler

import torch
import torch.nn as nn

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import Trainer, TrainingArguments

from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer, PreTrainedTokenizerFast
from transformers.modeling_outputs import SequenceClassifierOutput

import datasets
from datasets import load_from_disk
# from datasets import load_metric
# from evaluate import load

import matplotlib.pyplot as plt

from scipy.ndimage import gaussian_filter1d

from huggingface_hub import login

from typing import Optional

2024-11-07 19:44:19.336625: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-11-07 19:44:19.336680: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-11-07 19:44:19.337653: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-07 19:44:19.343546: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Preamble


In [4]:
# Run configuration: input representation, spectra type and pretrained checkpoint.

# 1. Input mode: predict from SMILES ("smiles") or from SELFIES ("selfies")
INPUTS = "smiles"  # "smiles", "selfies"

# 2. Train on computed ("comp") or experimental ("exp") spectra
DATA_TYPE = "comp"  # ["comp", "exp"]

# 3. Pretrained checkpoint
# MODEL_NAME, INPUTS = "ncfrey/ChemGPT-4.7M", "selfies"    # for ChemGPT
MODEL_NAME = "DeepChem/ChemBERTa-5M-MTR"                 # for SMILES / SELFIES
# MODEL_NAME = "seyonec/BPE_SELFIES_PubChem_shard00_160k"  # for SELFIES

# Part after the "<organisation>/" prefix; used to name the results folder.
MODEL_SUFFIX = MODEL_NAME.split("/")[1]

# Echo the chosen configuration so it is recorded in the notebook output.
print("\n".join([
    f"Inputs:    {INPUTS}",
    f"Data type: {DATA_TYPE}",
    f"Model:     {MODEL_NAME}",
]))

Inputs:    smiles
Data type: comp
Model:     DeepChem/ChemBERTa-5M-MTR


## Parameters

In [5]:
# Model parameters
# `args` is passed to Smile2Spec(args), which reads every key with args.get(...).

args = {
    'model_name': MODEL_NAME,     # checkpoint loaded with AutoModelForSequenceClassification
    'output_activation': 'exp',   # 'exp' or 'relu': maps the FFN output to non-negative values
    'norm_range': (50, 550),      # (start, stop) output indices whose sum normalises each prediction; None disables
    'dropout': 0.2,               # dropout probability used in the FFN
    'activation': nn.ReLU(),      # activation module used in the FFN
    'ffn_num_layers': 0,          # 0, 1, 3, 5, 10
    'ffn_hidden_dim': 2200,       # hidden dim of the FFN
    'ffn_output_dim': 1801        # output dim of the FFN
        }

# Training parameters

NB_EPOCHS = 5         # 5, 10
BATCH_SIZE = 64       # 16 for SELFIES due to OOM issues # 64 # cf. seyonechithrananda
FINETUNING = False    # Whether to Finetune the model on experimental dataset or not
LOADING_MODE = False  # Load an already trained model

In [6]:
# Every configuration gets its own folder below MODEL_CACHE; the folder name
# encodes the inputs, the data type, the checkpoint and the FFN shape.
MODEL_CACHE = "/storage/smiles2spec_models"

ffn_num_layers, ffn_hidden_dim = args["ffn_num_layers"], args["ffn_hidden_dim"]

SPECIFICATIONS = f"{INPUTS}_{DATA_TYPE}_{MODEL_SUFFIX}_FFNN-{ffn_num_layers}-{ffn_hidden_dim}"
RESULTS_FOLDER = os.path.join(MODEL_CACHE, SPECIFICATIONS)

print(f"Results folder: {RESULTS_FOLDER}")

Results folder: /storage/smiles2spec_models/smiles_comp_ChemBERTa-5M-MTR_FFNN-0-2200


## Datasets

In [7]:
# Folder containing the datasets saved to disk (loaded below with load_from_disk).
# DATASET_FOLDER = "/datasets"
DATASET_FOLDER = "/storage/smiles2spec_data"

In [8]:
# MODE is the infix inserted into the dataset file names
# (e.g. "train_{MODE}{DATA_TYPE}.hf"): the SELFIES datasets carry a
# "with_selfies_" marker, the SMILES datasets carry none.
if INPUTS == "selfies":
    MODE = "with_selfies_"
elif INPUTS == "smiles":
    MODE = ""
else:
    # Fail here with a clear message instead of a NameError on MODE later on.
    raise ValueError(f'INPUTS must be "smiles" or "selfies", got {INPUTS!r}')

In [9]:
# Load the train/validation splits of the selected data type, plus both test sets.
# keep_in_memory=True is used because the dataset folder is read-only.

def _load_split(folder_name):
    """Load one saved dataset from DATASET_FOLDER fully into memory."""
    return load_from_disk(os.path.join(DATASET_FOLDER, folder_name), keep_in_memory=True)


train_dataset = _load_split(f"train_{MODE}{DATA_TYPE}.hf")
val_dataset = _load_split(f"val_{MODE}{DATA_TYPE}.hf")

test_dataset_comp = _load_split(f"test_{MODE}comp.hf")
test_dataset_exp = _load_split(f"test_{MODE}exp.hf")

In [10]:
# Rename the target column "spectrum" to "labels" in all four datasets
# ("labels" is the key the custom trainer reads from each batch).
train_dataset, val_dataset, test_dataset_comp, test_dataset_exp = (
    dataset.rename_column("spectrum", "labels")
    for dataset in (train_dataset, val_dataset, test_dataset_comp, test_dataset_exp)
)

In [11]:
# Notebook display: features and row counts of the four datasets.
train_dataset, val_dataset, test_dataset_comp, test_dataset_exp

(Dataset({
     features: ['smiles', 'labels'],
     num_rows: 68404
 }),
 Dataset({
     features: ['smiles', 'labels'],
     num_rows: 8551
 }),
 Dataset({
     features: ['smiles', 'labels'],
     num_rows: 8551
 }),
 Dataset({
     features: ['base', 'shift', 'smiles', 'labels'],
     num_rows: 6000
 }))

## Tokenizer


In [12]:
# Models at https://huggingface.co/DeepChem
#     or at https://huggingface.co/seyonec/

# SECURITY: never commit an access token to the notebook. A literal token was
# previously hard-coded here; treat it as leaked and revoke it on the Hub.
# The token is now read from the HF_TOKEN environment variable; when it is not
# set, login(None) falls back to the interactive prompt.
HF_TOKEN = os.environ.get("HF_TOKEN")
login(HF_TOKEN)

In [13]:
# Load the tokenizer that belongs to the chosen checkpoint; downloaded files are cached in RESULTS_FOLDER.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir=RESULTS_FOLDER) # for ChemBERTa

# ChemGPT checkpoints: register an explicit [PAD] token
# (presumably their tokenizer ships without one — TODO confirm).
if MODEL_NAME.startswith("ncfrey/ChemGPT"):
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [14]:
def tokenize(batch, inputs_type="smiles"):
    """Tokenize one batch of molecule strings with the module-level ``tokenizer``.

    :param batch: mapping of column name -> list of values, as passed by
        ``Dataset.map(..., batched=True)``.
    :param inputs_type: name of the column holding the strings to tokenize.
    :return: the tokenizer output; sequences are truncated to at most 512
        tokens and padded (``padding=True``).
    """
    molecules = batch[inputs_type]
    return tokenizer(molecules, truncation=True, padding=True, max_length=512)

In [15]:
# Tokenize the INPUTS column of every dataset, batch by batch.
train_dataset, val_dataset, test_dataset_comp, test_dataset_exp = (
    dataset.map(tokenize, fn_kwargs={"inputs_type": INPUTS}, batched=True)
    for dataset in (train_dataset, val_dataset, test_dataset_comp, test_dataset_exp)
)

Map:   0%|          | 0/68404 [00:00<?, ? examples/s]

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/8551 [00:00<?, ? examples/s]

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [16]:
# Notebook display: the datasets now also carry 'input_ids' and 'attention_mask'.
train_dataset, val_dataset, test_dataset_comp, test_dataset_exp

(Dataset({
     features: ['smiles', 'labels', 'input_ids', 'attention_mask'],
     num_rows: 68404
 }),
 Dataset({
     features: ['smiles', 'labels', 'input_ids', 'attention_mask'],
     num_rows: 8551
 }),
 Dataset({
     features: ['smiles', 'labels', 'input_ids', 'attention_mask'],
     num_rows: 8551
 }),
 Dataset({
     features: ['base', 'shift', 'smiles', 'labels', 'input_ids', 'attention_mask'],
     num_rows: 6000
 }))

In [17]:
# Return torch tensors, restricted to the three columns the model consumes.
for dataset in (train_dataset, val_dataset, test_dataset_comp, test_dataset_exp):
    dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [18]:
# Notebook display: the datasets after set_format (the listed features are unchanged).
train_dataset, val_dataset, test_dataset_comp, test_dataset_exp

(Dataset({
     features: ['smiles', 'labels', 'input_ids', 'attention_mask'],
     num_rows: 68404
 }),
 Dataset({
     features: ['smiles', 'labels', 'input_ids', 'attention_mask'],
     num_rows: 8551
 }),
 Dataset({
     features: ['smiles', 'labels', 'input_ids', 'attention_mask'],
     num_rows: 8551
 }),
 Dataset({
     features: ['base', 'shift', 'smiles', 'labels', 'input_ids', 'attention_mask'],
     num_rows: 6000
 }))

In [19]:
# Sanity check: decode the first training example back to text (special and padding tokens included).
tokenizer.decode(train_dataset[0]["input_ids"])

'[CLS]COC(=O)c1ccc(NC(=O)Cn2c(-c3nnc(CC(C)C)o3)cc3ccccc32)cc1[SEP][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]'

## Model


In [22]:
# Notebook display: the checkpoint currently selected and its short name.
MODEL_NAME, MODEL_SUFFIX

('ncfrey/ChemGPT-4.7M', 'ChemGPT-4.7M')

In [23]:
# Length of the label vector of the first training example.
num_labels = len(train_dataset[0]["labels"])
num_labels

1801

In [22]:
class Smile2Spec(nn.Module):
    """Pretrained sequence-classification LLM followed by a feed-forward head.

    The logits of the LLM are fed to a stack of
    ``activation -> dropout -> Linear`` blocks. The result is mapped to
    non-negative values (``exp`` or ``relu``) and, when ``norm_range`` is set,
    divided by the sum of the outputs inside that index window.
    """

    def __init__(self, args):
        """
        Initializes the Smile2Spec model.

        :param args: dict of build options, read with ``args.get``:
            ``model_name``, ``output_activation`` (``'exp'`` or ``'relu'``),
            ``norm_range`` (``(start, stop)`` or ``None``), ``dropout``,
            ``activation`` (an ``nn.Module``), ``ffn_num_layers``,
            ``ffn_hidden_dim`` and ``ffn_output_dim``.
        """
        super().__init__()

        model_name = args.get('model_name')

        # LLM
        self.LLM = AutoModelForSequenceClassification.from_pretrained(model_name)

        if "ChemGPT" in model_name:
            # ChemGPT: reuse the EOS token id as padding id.
            self.LLM.config.pad_token_id = self.LLM.config.eos_token_id

        # Width of the logits that are fed to the FFN.
        input_dim = self._llm_output_dim(self.LLM, model_name)

        self.output_activation = args.get('output_activation')
        self.norm_range = args.get('norm_range')

        # New head on top of the LLM logits
        self.ffn = self._build_ffn(input_dim, args)

    @staticmethod
    def _llm_output_dim(llm, model_name):
        """Return the size of the last dimension of the LLM logits."""
        if "ChemGPT" in model_name:
            return llm.score.out_features
        if "ChemBERTa" in model_name:
            return llm.classifier.out_proj.out_features
        # Any other checkpoint: previously `input_dim` was left unbound here.
        # Sequence-classification heads emit `config.num_labels` logits.
        return llm.config.num_labels

    @staticmethod
    def _build_ffn(input_dim, args):
        """Build the ``activation -> dropout -> Linear`` stack.

        The stack holds ``ffn_num_layers + 1`` linear layers: ``input_dim`` to
        ``ffn_output_dim`` directly when ``ffn_num_layers`` is 0, otherwise
        through ``ffn_num_layers`` hidden layers of width ``ffn_hidden_dim``.
        """
        # The same (parameter-free) activation and dropout modules are reused
        # in front of every linear layer.
        dropout = nn.Dropout(args.get('dropout'))
        activation = args.get('activation')

        widths = (
            [input_dim]
            + [args.get('ffn_hidden_dim')] * args.get('ffn_num_layers')
            + [args.get('ffn_output_dim')]
        )

        layers = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            layers.extend([activation, dropout, nn.Linear(fan_in, fan_out)])

        return nn.Sequential(*layers)

    def forward(self,
                input_ids = None,
                attention_mask = None,
                labels=None):
        """
        Runs the Smile2Spec model on input.

        :param input_ids: token ids passed to the LLM.
        :param attention_mask: attention mask passed to the LLM.
        :param labels: accepted so a batch can be passed as ``model(**inputs)``;
            not used here.
        :return: Output of the Smile2Spec model (a tensor, not a ModelOutput)."""

        # Compute LLM output.
        LLM_output = self.LLM(input_ids,
                              attention_mask=attention_mask).logits # type: ignore

        # Compute ffn output.
        output = self.ffn(LLM_output)

        # Positive value
        if self.output_activation == 'exp':
            output = torch.exp(output)
        elif self.output_activation == 'relu':
            output = torch.relu(output)

        # Normalization: divide every row by the sum of its values inside
        # [norm_range[0], norm_range[1]).
        # NOTE(review): with 'relu' that sum can be zero, which yields inf/nan.
        if self.norm_range is not None:
            start, stop = self.norm_range[0], self.norm_range[1]
            norm_sum = torch.sum(output[:, start:stop], 1).unsqueeze(1)
            output = torch.div(output, norm_sum)

        return output


# √ 1. order of linear -> ReLU -> Dropout
# √ 2. check other projecting dim!
# √ 3. get projecting dim!
# √ 4. Test learning for 0, 1, and 3
# √ 5. Check the same for ChemGPT
# √ 6. Modify .py file (args -> args_d, remove ffn_input_dim)
# 7. push and run simuls


# # MY ATTEMPT... DOES NOT WORK FOR NOW!!!
# # Modify the classifier module
# class ModifiedRobertaClassificationHead(nn.Module):
#     def __init__(self, args, original_classifier, num_layers=1):
#         super().__init__()
        
#         self.num_layers = num_layers
        
#         self.original_classifier = original_classifier
#         if self.num_layers == 0:
#             self.original_classifier.out_proj.out_features = args.get("ffn_output_dim")
#         else:
#             self.original_classifier.out_proj.out_features = args.get("ffn_hidden_dim")
        
        
#         # Create a list to hold the layers
#         layers = []
#         # layers.append(nn.ReLU()) # WRONG!!!
        
#         # Add additional hidden layers based on num_layers
#         for l in range(self.num_layers):
#             if self.num_layers > 1 and l < self.num_layers - 1:
#                 layers.append(self.original_classifier.dropout)
#                 layers.append(nn.Linear(args.get("ffn_hidden_dim"), args.get("ffn_hidden_dim")))
#             else:
#                 layers.append(self.original_classifier.dropout)
#                 layers.append(nn.Linear(args.get("ffn_hidden_dim"), args.get("ffn_output_dim")))
#             layers.append(nn.ReLU())

#         # Convert the list of layers to a Sequential model
#         if self.num_layers > 0:
#             self.extended_classifier = nn.Sequential(*layers)

#     def forward(self, x):
        
#         x = self.original_classifier(x)
#         if self.num_layers > 0:
#             x = self.extended_classifier(x) # XXX
        
#         return x


# class Smile2Spec(nn.Module):
#     """A Smile2Spec model contains a LLM head, followed by a Feed Forward MLP."""
    
#     def __init__(self, args):
#         """
#         Initializes the Smile2Spec model.
#         :param args: argument for building the model."""

#         super(Smile2Spec, self).__init__()

#         # # Create LLM head. # xxx old
#         # self.LLM = AutoModelForSequenceClassification.from_pretrained(args.get('model_name'), 
#         #                                                               num_labels=args.get('ffn_output_dim'))
        
#         if args.get('model_name').startswith("ncfrey/ChemGPT"): # xxx
#             self.LLM.config.pad_token_id = self.LLM.config.eos_token_id
        
#         # Create output objects
#         self.output_activation = args.get('output_activation')
#         self.norm_range = args.get('norm_range')

#         # Create FFN params
#         dropout = nn.Dropout(args.get('dropout'))
#         activation = args.get('activation')

#         # Create LLM and FFN layers XXXXX
#         if args.get('ffn_num_layers') == 0:
#             num_labels = args.get('ffn_output_dim')
#         else:
#             num_labels = args.get('ffn_hidden_dim')
        
#         self.LLM = AutoModelForSequenceClassification.from_pretrained(args.get('model_name'), 
#                                                                       num_labels=num_labels)
        
#         # Replace the model's original classifier with the extended one
#         new_classifier = ModifiedRobertaClassificationHead(args, self.LLM.classifier, 
#                                                            num_layers=args.get('ffn_num_layers'))
#         self.LLM.classifier = new_classifier

#     def forward(self,
#                 input_ids = None,
#                 attention_mask = None,
#                 labels = None):
#         """
#         Runs the Smile2Spec model on input.
        
#         :return: Output of the Smile2Spec model."""

#         # Compute LLM output
#         output = self.LLM(input_ids, attention_mask=attention_mask).logits # type: ignore
#         print(output.shape)

# #         # Positive value mapping
# #         if self.output_activation == 'exp':
# #             output = torch.exp(output)
            
# #         if self.output_activation == 'ReLU':
# #             f = nn.ReLU()
# #             output = f(output)

#         # Normalization mapping
#         if self.norm_range is not None:
#             # norm_data = output[:, self.norm_range[0]:self.norm_range[1]] # XXX
#             norm_data = output[:, self.norm_range[0]:self.norm_range[1]]
#             norm_sum = torch.sum(norm_data, 1)
#             norm_sum = torch.unsqueeze(norm_sum, 1)
#             # output = torch.div(output, norm_sum) # XXX
#             output = torch.div(output, norm_sum)

#         return output

In [23]:
class SIDLoss(nn.Module):
    """Symmetric divergence between predicted and target spectra.

    For every row the loss is
    ``sum(p * log(p / q) + q * log(q / p))`` with ``p`` the predicted and ``q``
    the target spectrum (a symmetrised KL-type divergence); the result is the
    mean over the rows.

    :param eps: lower bound applied to both spectra before the logarithm and
        the division. Without it a single zero (or negative) entry turns the
        whole loss into inf/NaN.
    """

    def __init__(self, eps=1e-8):
        super().__init__()
        self.eps = eps

    def forward(self, model_spectra, target_spectra):
        """
        :param model_spectra: predicted spectra, one spectrum per row.
        :param target_spectra: reference spectra, same shape as ``model_spectra``.
        :return: scalar tensor, the divergence averaged over the rows.
        """
        # Out-of-place clamp: the caller's tensors are left untouched.
        model_spectra = torch.clamp(model_spectra, min=self.eps)
        target_spectra = torch.clamp(target_spectra, min=self.eps)

        loss = torch.mul(torch.log(torch.div(model_spectra, target_spectra)), model_spectra) \
                + torch.mul(torch.log(torch.div(target_spectra, model_spectra)), target_spectra)

        # Sum over the spectrum axis, then average over the batch.
        loss = torch.sum(loss, dim=1)

        return loss.mean()

## Training


In [24]:
# NB_EPOCHS = 5 # 10
# BATCH_SIZE = 64 # 16 for SELFIES due to OOM issues # 64 # cf. seyonechithrananda

In [25]:
# # Model parameters

# args = {
#     'model_name': MODEL_NAME,
#     'output_activation': 'exp',
#     'norm_range': None, # (50, 550),
#     'dropout': 0.2,
#     'activation': nn.ReLU(),
#     'ffn_num_layers': 1, # 0, 3
#     'ffn_hidden_dim': 2200,      # hidden dim of the FFN
#     'ffn_output_dim': 1801        # output dim of the FFN
#         }

In [26]:
# Build the model from the `args` dict, then display its module tree.
model = Smile2Spec(args)
model

config.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/14.0M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-5M-MTR and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Smile2Spec(
  (LLM): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(600, 384, padding_idx=1)
        (position_embeddings): Embedding(515, 384, padding_idx=1)
        (token_type_embeddings): Embedding(1, 384)
        (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.144, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-2): 3 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSdpaSelfAttention(
                (query): Linear(in_features=384, out_features=384, bias=True)
                (key): Linear(in_features=384, out_features=384, bias=True)
                (value): Linear(in_features=384, out_features=384, bias=True)
                (dropout): Dropout(p=0.109, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): L

In [27]:
# Count all parameters of the model and, separately, those that require gradients.
param_sizes = [(p.numel(), p.requires_grad) for p in model.parameters()]

total_params = sum(size for size, _ in param_sizes)
total_trainable_params = sum(size for size, trainable in param_sizes if trainable)

print("Total Params. : ", f"{total_params:,}")
print("Total Trainable Params. : ", f"{total_trainable_params:,}")

Total Params. :  6,749,457
Total Trainable Params. :  6,749,457


In [28]:
# New loss will be implemented here

class CustomTrainer(Trainer):
    """Trainer that optimises the SID loss between model output and labels."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the SID loss for one batch.

        :param model: the model being trained; it is called as ``model(**inputs)``
            and must return the predicted spectra as a tensor.
        :param inputs: batch dict; its ``"labels"`` entry holds the target spectra.
        :param return_outputs: when True, also return the predictions.
        :param num_items_in_batch: accepted (and ignored) because
            transformers >= 4.46 passes this keyword to ``compute_loss``;
            without it the override raises a TypeError on those versions.
        :return: ``loss``, or ``(loss, {"label": predictions})`` when
            ``return_outputs`` is True.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        loss = SIDLoss()(outputs, labels)

        return (loss, {"label": outputs}) if return_outputs else loss

In [29]:
# Training configuration: evaluate, log and checkpoint every 400 steps and
# reload the checkpoint with the lowest eval loss at the end.
training_args = TrainingArguments(

    # output
    output_dir=RESULTS_FOLDER,

    # params
    num_train_epochs=NB_EPOCHS,               # nb of epochs
    per_device_train_batch_size=BATCH_SIZE,   # batch size per device during training
    per_device_eval_batch_size=BATCH_SIZE,    # cf. paper Sun et al.
    learning_rate=5e-5, #2e-5,                # cf. seyonechithrananda / cf. paper Sun et al.
#     warmup_steps=500,                         # number of warmup steps for learning rate scheduler
    warmup_ratio=0.1,                         # cf. paper Sun et al.
    weight_decay=0.01,                        # strength of weight decay

    # eval
    eval_strategy="steps",                    # cf. paper Sun et al.
    eval_steps=400,                           # cf. paper Sun et al.

    # log
    # os.path.join: plain concatenation produced "<RESULTS_FOLDER>logs"
    # (a sibling folder) instead of "<RESULTS_FOLDER>/logs".
    logging_dir=os.path.join(RESULTS_FOLDER, 'logs'),
    logging_strategy='steps',
    logging_steps=400,

    # save
    save_strategy='steps',
    save_total_limit=2,
    save_steps=400,                           # save model at every eval (default 500)
    load_best_model_at_end=True,              # cf. paper Sun et al.
    metric_for_best_model='eval_loss',
    # metric_for_best_model='mse', # XXX

    report_to="none",                         # "wandb" or "none" to turn wandb off!
    # run_name=f"{model_suffix}",               # name of the W&B run (optional)

    # keep every column of the batch: the custom loss reads "labels" itself
    remove_unused_columns=False
)

In [30]:
# Trainer for the main run: SID loss, training set for optimisation, validation set for evaluation.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

In [31]:
# Run the training and measure its wall-clock duration in seconds.
start_time = time.time()

results = trainer.train()

training_time = time.time() - start_time

[2024-11-07 19:52:26,628] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


RuntimeError: mat1 and mat2 shapes cannot be multiplied (64x199 and 1801x1801)

In [None]:
# save history (loss), trainable params, training time

def _dump_to_results(obj, filename):
    """Pickle `obj` into RESULTS_FOLDER/filename."""
    with open(os.path.join(RESULTS_FOLDER, filename), "wb") as fh:
        pickle.dump(obj, fh)


_dump_to_results(trainer.state.log_history, "log_history.pkl")
_dump_to_results(training_time, "training_time.pkl")

params = {"total_params": total_params, "total_trainable_params" : total_trainable_params}
_dump_to_results(params, "nb_parameters.pkl")

In [None]:
# trainer.save_model(RESULTS_FOLDER)

# Save only the weights. The path is built with os.path.join, the same way the
# loading cell builds it, so both always point at the same file.
torch.save(model.state_dict(), os.path.join(RESULTS_FOLDER, "model.pt"))

In [None]:
# remove checkpoints since best model saved (saves space)

for entry in os.listdir(RESULTS_FOLDER):
    # only the "checkpoint*" entries are deleted
    if entry.startswith("checkpoint"):
        shutil.rmtree(os.path.join(RESULTS_FOLDER, entry))

## Fine-Tune (if needed, depends on the task)

In [None]:
# FINETUNING = False # Whether to Finetune the model on experimental dataset or not 

In [None]:
# Fine tune on experimental

if FINETUNING:

    # Load dataset
    # NOTE(review): the other datasets are loaded as "<split>_{MODE}<type>.hf";
    # confirm that "train_exp" / "val_exp" are the intended folder names.
    train_dataset_exp = load_from_disk(os.path.join(DATASET_FOLDER, "train_exp"), keep_in_memory=True)
    val_dataset_exp = load_from_disk(os.path.join(DATASET_FOLDER, "val_exp"), keep_in_memory=True)

    # Preprocess Dataset
    train_dataset_exp = train_dataset_exp.rename_column("spectrum", "labels")
    val_dataset_exp = val_dataset_exp.rename_column("spectrum", "labels")

    # Tokenize the same column as in the main run; without fn_kwargs the
    # "smiles" column was always used, even when INPUTS == "selfies".
    train_dataset_exp = train_dataset_exp.map(tokenize, fn_kwargs={"inputs_type": INPUTS}, batched=True)
    val_dataset_exp = val_dataset_exp.map(tokenize, fn_kwargs={"inputs_type": INPUTS}, batched=True)

    train_dataset_exp.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
    val_dataset_exp.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# Training configuration of the optional fine-tuning run on experimental
# spectra: half the epochs of the main run, evaluation/logging/checkpointing
# every 200 steps.
if FINETUNING:

    training_args_exp = TrainingArguments(

        # output
        output_dir=f"/storage/smiles2spec_models/exp/{MODEL_SUFFIX}",

        # params
        num_train_epochs=NB_EPOCHS // 2,          # nb of epochs
        per_device_train_batch_size=BATCH_SIZE,   # batch size per device during training
        per_device_eval_batch_size=BATCH_SIZE,    # cf. paper Sun et al.
        learning_rate=5e-5, #2e-5,                # cf. seyonechithrananda / cf. paper Sun et al.
    #     warmup_steps=500,                         # number of warmup steps for learning rate scheduler
        warmup_ratio=0.1,                         # cf. paper Sun et al.
        weight_decay=0.01,                        # strength of weight decay

        # eval
        # `eval_strategy` (as in the main training arguments) replaces the
        # deprecated `evaluation_strategy` spelling.
        eval_strategy="steps",                    # cf. paper Sun et al.
        eval_steps=200,                           # cf. paper Sun et al.

        # log
        # os.path.join: plain concatenation produced "...<MODEL_SUFFIX>logs".
        logging_dir=os.path.join(f"/storage/smiles2spec_models/exp/{MODEL_SUFFIX}", 'logs'),
        logging_strategy='steps',
        logging_steps=200,

        # save
        save_strategy='steps',
        save_total_limit=2,
        save_steps=200,                           # save model at every eval (default 500)
        load_best_model_at_end=True,              # cf. paper Sun et al.
        metric_for_best_model='eval_loss',
        # metric_for_best_model='mse', # XXX

        report_to="none",                         # "wandb" or "none" to turn wandb off!
        # run_name=f"{model_suffix}",               # name of the W&B run (optional)

        remove_unused_columns=False
    )

In [None]:
# Trainer for the fine-tuning run: it continues from the current `model` on the experimental datasets.
if FINETUNING:
    
    trainer_exp = CustomTrainer(
        model=model,
        args=training_args_exp,
        tokenizer=tokenizer,
        train_dataset=train_dataset_exp,
        eval_dataset=val_dataset_exp
    )

In [None]:
# Run the fine-tuning (only when FINETUNING is enabled).
if FINETUNING:
    
    trainer_exp.train()

## Results


In [None]:
# load model and results if already trained and evaluated

# LOADING_MODE = False

In [None]:
# compute predictions
# NOTE(review): this uses `trainer`, which is only (re)created in the loading
# cell further down — in LOADING_MODE this cell relies on an earlier `trainer`.

predicts_comp = trainer.predict(test_dataset_comp)
predicts_exp = trainer.predict(test_dataset_exp)

# Split each prediction output into predicted spectra and reference spectra.
test_preds_comp, test_truths_comp = predicts_comp.predictions, predicts_comp.label_ids
test_preds_exp, test_truths_exp = predicts_exp.predictions, predicts_exp.label_ids

In [None]:
# load model if necessary

if LOADING_MODE:

    # Rebuild the architecture from the `args` of the "Parameters" cell.
    # RESULTS_FOLDER is derived from those same values, so the weights stored
    # there only fit a model built from them. The separate hard-coded dict
    # used before (ffn_num_layers=1, norm_range=None) described a different
    # model and also overwrote the global `args`.
    model = Smile2Spec(args)

    model_state_dict = torch.load(os.path.join(RESULTS_FOLDER, "model.pt"))
    model.load_state_dict(model_state_dict)

    # inference mode (disables dropout)
    model.eval()

    trainer = CustomTrainer(
        model=model,
        args=training_args,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    print("Model loaded.")

In [None]:
# Notebook display: shapes of the predictions and of the matching references.
test_preds_comp.shape, test_truths_comp.shape, test_preds_exp.shape, test_truths_exp.shape

In [None]:
# save results (if not in loading mode)

if not LOADING_MODE:

    # one file per array, all written into RESULTS_FOLDER
    for filename, array in (
        ('test_preds_comp.pt', test_preds_comp),
        ('test_truths_comp.pt', test_truths_comp),
        ('test_preds_exp.pt', test_preds_exp),
        ('test_truths_exp.pt', test_truths_exp),
    ):
        torch.save(array, os.path.join(RESULTS_FOLDER, filename))

    print("Predictions saved.")

In [None]:
# load results if necessary
if LOADING_MODE:

    # read back the four arrays written by the saving cell, in the same order
    test_preds_comp, test_truths_comp, test_preds_exp, test_truths_exp = (
        torch.load(os.path.join(RESULTS_FOLDER, filename))
        for filename in (
            'test_preds_comp.pt',
            'test_truths_comp.pt',
            'test_preds_exp.pt',
            'test_truths_exp.pt',
        )
    )

    print("Predictions loaded.")

In [None]:
def SISScore(predicted_spectrum, true_spectrum):
    """Similarity score ``1 / (1 + SID)`` between two spectra.

    Both spectra are smoothed with a Gaussian filter (sigma 5), reshaped to a
    single row and L1-normalised before the SID loss is evaluated.

    :param predicted_spectrum: 1-D array with the predicted spectrum.
    :param true_spectrum: 1-D array with the reference spectrum.
    :return: 0-dim tensor holding the score.
    """

    def _smooth_and_normalize(spectrum):
        # Gaussian convolution, then L1 normalisation of the (1, n) row
        smoothed = gaussian_filter1d(spectrum, 5)
        return nn.functional.normalize(torch.tensor(smoothed).reshape(1, -1), p=1)

    prediction = _smooth_and_normalize(predicted_spectrum)
    reference = _smooth_and_normalize(true_spectrum)

    sid = SIDLoss()(prediction, reference)

    return 1/(1+sid)

In [None]:
# Plot five randomly chosen computed test spectra against their predictions.
# The index range follows the array length instead of a hard-coded 8551, so
# the cell keeps working when the test set changes size.
for i in np.random.choice(len(test_truths_comp), 5):

    plt.figure(figsize=(14, 5))

    spectrum_truth = test_truths_comp[i, :]
    spectrum_pred = test_preds_comp[i, :]
    spectrum_pred_smooth = gaussian_filter1d(spectrum_pred, 5) # plot smoothed predictions if preferred

    sis_score = SISScore(spectrum_pred, spectrum_truth)

    plt.plot(range(len(spectrum_truth)), spectrum_truth, label="true spectrum", alpha=0.5)
    plt.plot(range(len(spectrum_pred)), spectrum_pred, label=f"predicted spectrum \nSIS Score : {sis_score:.2f}", alpha=0.5)

    plt.legend()
    plt.show()

In [None]:
# Plot five randomly chosen experimental test spectra against their predictions.
# The index range follows the array length instead of a hard-coded 6000, so
# the cell keeps working when the test set changes size.
for i in np.random.choice(len(test_truths_exp), 5):

    plt.figure(figsize=(14, 5))

    spectrum_truth = test_truths_exp[i, :]
    spectrum_pred = test_preds_exp[i, :]
    spectrum_pred_smooth = gaussian_filter1d(spectrum_pred, 5) # plot smoothed predictions if preferred

    sis_score = SISScore(spectrum_pred, spectrum_truth)

    plt.plot(range(len(spectrum_truth)), spectrum_truth, label="true spectrum", alpha=0.5)

    plt.plot(range(len(spectrum_pred)), spectrum_pred, label=f"predicted spectrum \nSIS Score : {sis_score:.2f}", alpha=0.5)

    plt.legend()
    plt.show()