In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
import re
import numpy as np
import torch
import functools
from multiprocessing import Pool

from src.ForceFields.GAFF.ReadPara import read_GAFF2_dat
from src.ForceFields.GAFF.Compute_Energy_Force import ComputeEnergyForce
from src.GB_FFs.ParameterGenerator_ChargeTransfer_Model import OptimizedPara
from src.SPICE.ReadMol import para_process_mol, construct_dataloader
from src.SPICE.FineTuning import Test_SPICE,Train_SPICE

GB-FFs GAFF:   bond_harmonic=True,  bond_morse=False, angle_ub=False

GB-FFs MORSE:  bond_harmonic=False, bond_morse=False, angle_ub=False

GB-FFs UB:     bond_harmonic=False, bond_morse=True,  angle_ub=True

In [None]:
# Prepare to process molecules
# The three boolean flags below select the functional form of the force field
# (see the GB-FFs GAFF / MORSE / UB combinations listed in the text above).
bond_harmonic = True # [True, False] whether to use the harmonic function for the bond energy
bond_morse = False # [True, False] whether to use the full Morse function for the bond energy
angle_ub = False # [True, False] whether to use the Urey-Bradley term. We use E = epsilon((r_UB / r)^2 - 1)^2 instead of a harmonic function
nb_cpu = 30 # number of CPUs to use (size of the multiprocessing pool)
nb_time_processing = 20 # one CPU will process 'nb_time_processing' molecules at a time

bool_force = True # [True, False] whether to consider forces.
dataset = 'SPICE' # dataset name; also the sub-directory of './data/' that holds the molecule index file
GAFF = read_GAFF2_dat(bond_morse, angle_ub) # read GAFF parameters from 'GAFF2.dat'
max_len = 100 # maximum number of atoms in a compound, 100 is ok

Pre-trained models:

1.ANI1-GAFF.pth: GB-FFs GAFF with GAFF charge

2.ANI1-MORSE.pth: GB-FFs MORSE with GAFF charge

3.ANI1-UB.pth: GB-FFs UB with GAFF charge

Fine-tuned models:

1.SPICE-GAFF-embedding.pth: GB-FFs GAFF with only the embedding layers fine-tuned (with AM1-BCC charge)

2.SPICE-GAFF-AM1-BCC.pth: GB-FFs GAFF with AM1-BCC charge

3.SPICE-GAFF.pth: GB-FFs GAFF with GB-FFs charge

4.SPICE-MORSE.pth: GB-FFs MORSE with GB-FFs charge

5.SPICE-UB.pth: GB-FFs UB with GB-FFs charge

In [None]:
# Model and training settings (must match the checkpoint chosen in model_name).
model_name = 'SPICE-GAFF.pth' # the model to load (file name inside './model/')
charge_type = 'GB-FFs' # ['AM1-BCC', 'GB-FFs'] choose the type of charges you want to use
d_model = 512      # dimension of model (D_h)
dropout = 0.1      # dropout to reduce overfitting
num_heads = 16     # number of attention heads in Large layers (N_heads)
nb_layer = 3       # number of Large layers (L)
activation = "SMU" # [relu,leakyrelu,gelu,SMU] the type of activation function (\sigma)
store_name = 'tmp.pth' # the name of the saved model. It should be None if you don't want to save the best model
                       # Attention: if store_name = None, the training stage will not be early-stopped!
force_weight = 1   # 1 is ok (\eta)

Everything is set; you just need to run the following cells!

In [None]:
# Read the conformations, energies, forces, GAFF parameters...

# The index file ./data/<dataset>/mol.txt stores the train/validation/test molecules in our paper.
# It lists one molecule name per line; lines containing 'TRAIN', 'VALIDATION' and 'TEST'
# are section markers that separate the three splits.
assert charge_type in ['AM1-BCC', 'GB-FFs'], 'Wrong charge type!!!'
mol = []
all_name = []
# Reset the split boundaries on every run of this cell, so a value left over from a
# previous execution can never be silently reused when a marker line is missing.
train_index = None
val_index = None
index_path = os.path.join('./data/',dataset,'mol.txt')
with open(index_path, 'r') as IndexFile:
    for line in IndexFile.read().splitlines():
        if not line.strip():
            # A blank line is not a molecule name; counting it would shift the split boundaries.
            continue
        if 'TRAIN' in line:
            pass
        elif 'VALIDATION' in line:
            train_index = len(all_name)  # everything read so far belongs to the training split
        elif 'TEST' in line:
            val_index = len(all_name)    # everything between the two markers is validation
        else:
            all_name.append(line)

if train_index is None or val_index is None:
    raise ValueError("Index file '" + index_path + "' must contain both a 'VALIDATION' and a 'TEST' marker line")

# Number of work items: each one covers 'nb_time_processing' consecutive molecules.
nb_chunk = int(np.ceil(len(all_name)/nb_time_processing))
process_chunk = functools.partial(
    para_process_mol,
    all_name=all_name,
    dataset=dataset,
    GAFF=GAFF,
    bond_morse=bond_morse,
    angle_ub=angle_ub,
    max_len=max_len,
    nb_time_processing=nb_time_processing,
)

with Pool(processes=nb_cpu) as pool:
    # pool.imap yields results in submission order, so 'mol' keeps the order of 'all_name'.
    for results in pool.imap(process_chunk, range(nb_chunk)):
        i, mol_part = results
        mol += mol_part
        if (i+1) % (10*nb_cpu) == 0:
            print(i+1,'/',nb_chunk,' finished')
print('Molecule processing finished!')

# NOTE(review): train_index / val_index count entries of 'all_name'. Slicing 'mol' with them
# assumes para_process_mol returns exactly one entry per molecule name - confirm.
mol_train = mol[:train_index]
mol_val = mol[train_index:val_index]
mol_test = mol[val_index:]

print('Reading conformations...')
# The third argument is True only for the training split.
# NOTE(review): presumably it enables shuffling - confirm in construct_dataloader.
train_dataloader = construct_dataloader(mol_train, bool_force, True)
val_dataloader = construct_dataloader(mol_val, bool_force, False)
test_dataloader = construct_dataloader(mol_test, bool_force, False)
print('Dataloaders are generated sucessfully!')

In [None]:
# Generate the models
# 'model' is the GB-FFs model
# 'model2' computes energy and force according to FFs parameters and conformations

# Resolve the device once so that checkpoint loading and model placement always agree.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = OptimizedPara(d_model=d_model, dropout=dropout, num_heads=num_heads, nb_layer=nb_layer,activation=activation,bond_morse=bond_morse,angle_ub=angle_ub,leakyrelu=0.1)

# map_location remaps the stored tensors onto 'device'. Without it, torch.load tries to restore
# them onto the device they were saved from, which fails on a CPU-only machine or when that
# GPU index is not visible in this process.
model.load_state_dict(torch.load('./model/' + model_name, map_location=device))
model = model.to(device)

model2 = ComputeEnergyForce(bond_harmonic, bond_morse, angle_ub)
model2 = model2.to(device)


In [None]:
# You can use the following code to check the loss on test dataset:
# When the notebook is run top to bottom, this evaluates the model as loaded from the
# checkpoint, i.e. before the fine-tuning cell further down has been executed.
Para_Loss, Force_Loss, num_conf, num_mol = Test_SPICE(model, model2, test_dataloader, charge_type, bool_force, angle_ub, print_info=False)
print('After filtering, ', num_mol, ' molecules (', num_conf,' conformations) are left')
# The last entry of Para_Loss is the value reported as the energy RMSE.
print('RMSE for Energy (Kcal/mol) :', Para_Loss[-1])
print('RMSE for Force (Kcal/mol/A):', Force_Loss)

In [None]:
# # Only fine-tune embedding layers (optional alternative to the full fine-tuning cell below; uncomment to use)
# for name,parameters in model.named_parameters():
#     if 'embedding' in name:
#         parameters.requires_grad = True
#     else:
#         parameters.requires_grad = False
# epochs = 300
# start_lr = 1e-5
# end_lr = 1e-6
# rate = 0.995
# step_size = 250

# model = Train_SPICE(model, model2, train_dataloader, val_dataloader, test_dataloader, charge_type, bool_force, force_weight, angle_ub, epochs, start_lr, end_lr, rate, step_size, store_name)


In [None]:
# Fine-tune on all trainable parameters
# Unfreeze every parameter of the model before training starts.
for param in model.parameters():
    param.requires_grad = True

# Training settings, forwarded unchanged to Train_SPICE below.
epochs, step_size = 300, 250
start_lr, end_lr, rate = 1e-5, 1e-6, 0.995

# Train_SPICE returns the model, which replaces the current 'model'.
model = Train_SPICE(
    model, model2,
    train_dataloader, val_dataloader, test_dataloader,
    charge_type, bool_force, force_weight, angle_ub,
    epochs, start_lr, end_lr, rate, step_size,
    store_name,
)


In [None]:
# You can use the following code to check the loss on test dataset:
# When the notebook is run top to bottom, 'model' here is the one returned by Train_SPICE,
# so these numbers describe the fine-tuned model.
Para_Loss, Force_Loss, num_conf, num_mol = Test_SPICE(model, model2, test_dataloader, charge_type, bool_force, angle_ub, print_info=False)
print('After filtering, ', num_mol, ' molecules (', num_conf,' conformations) are left')
# The last entry of Para_Loss is the value reported as the energy RMSE.
print('RMSE for Energy (Kcal/mol) :', Para_Loss[-1])
print('RMSE for Force (Kcal/mol/A):', Force_Loss)