In [None]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import re
import numpy as np
import torch
import functools
from multiprocessing import Pool

from src.ForceFields.GAFF.ReadPara import read_GAFF2_dat
from src.ForceFields.GAFF.Compute_Energy_Force import ComputeEnergyForce
from src.GB_FFs.ParameterGenerator_ChargeTransfer_Model import OptimizedPara
from src.PreTraining.ReadMol import para_process_mol, construct_dataloader
from src.PreTraining.PreTraining import Test_ANI1,Train_ANI1,Train_para, Train_GAFF

GB-FFs GAFF:   bond_morse=False, bool_ub=False

GB-FFs MORSE:  bond_morse=True,  bool_ub=False

GB-FFs UB:     bond_morse=True,  bool_ub=True

In [None]:
# Settings used to process the molecules

# [True, False] whether to use the full Morse function for the bond energy
bond_morse = True
# [True, False] whether to use the Urey-Bradley term.
# E = epsilon((r_UB / r)^2 - 1)^2 is used instead of a harmonic function.
bool_ub = False

# Number of CPUs to use
nb_cpu = 30
# One CPU processes 'nb_time_processing' molecules at a time
nb_time_processing = 20

dataset = "ANI-1"
# Maximum number of atoms in a compound (100 is also fine)
max_len = 26
# GAFF parameters read from 'GAFF2.dat'
GAFF = read_GAFF2_dat(bond_morse, bool_ub)

Pre-training models:

1. `ANI1-GAFF.pth`: GB-FFs GAFF with GAFF charge

2. `ANI1-MORSE.pth`: GB-FFs MORSE with GAFF charge

3. `ANI1-UB.pth`: GB-FFs UB with GAFF charge

In [None]:
# --- Checkpoints ---
# The model to load
model_name = "PreTraining-MORSE.pth"
# Name under which the best model is saved. It should be None if you don't want to save it.
# Attention: if store_name = None, the training stage will not be early-stopped!
store_name = "tmp.pth"

# --- Architecture ---
# Dimension of the model (D_h)
d_model = 512
# Number of attention heads in the Large layers (N_heads)
num_heads = 16
# Number of Large layers (L)
nb_layer = 3
# Type of activation function (\sigma), one of [relu, leakyrelu, gelu, SMU]
activation = "SMU"
# Dropout rate, used against overfitting
dropout = 0.1

Everything is set and you just need to run the following code!!!

In [None]:
# Read the conformations, energies, forces, GAFF parameters...

# ./data/<dataset>/mol.txt lists the molecules to process, one name per line.
with open(os.path.join('./data', dataset, 'mol.txt'), 'r') as IndexFile:
    # Blank lines are skipped so that an empty name is never handed to a worker.
    all_name = [line for line in IndexFile.read().splitlines() if line.strip()]

# One task per batch of 'nb_time_processing' names; computed once because it is
# needed both to drive the pool and to report progress.
nb_task = int(np.ceil(len(all_name) / nb_time_processing))
process_batch = functools.partial(
    para_process_mol,
    all_name=all_name,
    dataset=dataset,
    GAFF=GAFF,
    bond_morse=bond_morse,
    bool_ub=bool_ub,
    max_len=max_len,
    nb_time_processing=nb_time_processing,
)

mol = []
with Pool(processes=nb_cpu) as pool:
    # imap yields results in submission order, so 'mol' is filled task by task.
    for i, mol_part in pool.imap(process_batch, range(nb_task)):
        mol += mol_part
        if (i+1) % (10*nb_cpu) == 0:
            print(i+1,'/',nb_task,' finished')
print('Molecule processing finished!')

# Positional split of the processed molecules: first 80% for training,
# next 10% for validation, remaining 10% for testing (no shuffling here).
train_index = int(np.ceil(len(mol) * 0.8))
val_index = int(np.ceil(len(mol) * 0.9))
mol_train = mol[:train_index]
mol_val = mol[train_index:val_index]
mol_test = mol[val_index:]

print('Reading conformations...')
train_dataloader = construct_dataloader(mol_train, True)
val_dataloader = construct_dataloader(mol_val, False)
test_dataloader = construct_dataloader(mol_test, False)
print('Dataloaders are generated sucessfully!')

In [None]:
# Generate the models
# 'model' is the GB-FFs model
# 'model2' is to compute energy and force according to FFs parameters and conformations

# Pick the device once so that loading and both models agree on it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = OptimizedPara(d_model=d_model, dropout=dropout, num_heads=num_heads, nb_layer=nb_layer,activation=activation,leakyrelu=0.1,bool_ub=bool_ub)

# map_location remaps the stored tensors onto 'device'; without it a checkpoint
# saved from a GPU cannot be deserialized on a CPU-only machine.
model.load_state_dict(torch.load('./model/' + model_name, map_location=device))
model = model.to(device)

model2 = ComputeEnergyForce(bond_morse, bool_ub)
model2 = model2.to(device)



In [None]:
# Stage 1: the loss is the parameters
# Enable gradients for every parameter of the model.
for param in model.parameters():
    param.requires_grad = True

epochs, step_size = 5, 100
start_lr, end_lr = 1e-4, 1e-6
rate = 0.99
model = Train_para(
    model, model2,
    train_dataloader, val_dataloader, test_dataloader,
    epochs, start_lr, end_lr, rate, step_size,
)
print()

# Stage 2: the loss is the parameters and the single energy
# (the energy for each interaction, which means, every bond/angle/...)
for param in model.parameters():
    param.requires_grad = True

epochs, step_size = 5, 100
start_lr, end_lr = 1e-5, 1e-7
rate = 0.99
target_type = "single"  # single sum
model = Train_GAFF(
    model, model2,
    train_dataloader, val_dataloader, test_dataloader,
    epochs, start_lr, end_lr, rate, step_size, target_type,
)


In [None]:
# Fine-tune on all trainable parameters
# The loss is the parameters and the total relative energy (the sum of energy for all interactions)
for param in model.parameters():
    param.requires_grad = True

epochs, step_size = 5, 100
start_lr, end_lr = 1e-5, 1e-7
rate = 0.99

model = Train_ANI1(
    model, model2,
    train_dataloader, val_dataloader, test_dataloader,
    epochs, start_lr, end_lr, rate, step_size, store_name,
)


In [None]:
# Check the loss on the test dataset
test_outcome = Test_ANI1(model, model2, test_dataloader)
Para_Loss, num_conf, num_mol = test_outcome

# The last entry of Para_Loss is the one reported as the energy RMSE.
energy_rmse = Para_Loss[-1]
print('After filtering, ', num_mol, ' molecules (', num_conf,'conformations) are left')
print('RMSE for Energy (Kcal/mol) :', energy_rmse)