In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to project source files are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from rdkit import Chem
from rdkit.Chem import Descriptors

from frust.embedder import embed_ts
from frust.stepper import Stepper
from frust.transformers import transformer_ts1

from tooltoad.chemutils import ac2mol
from tooltoad.vis import MolTo3DGrid

In [3]:
# Read the "smiles" column of the ir_borylation dataset, drop repeated entries
# while keeping first-seen order, and pair every SMILES with its RDKit molecule.
# (Alternative input used previously: "../datasets/font_smiles.csv".)
data = pd.read_csv("../datasets/ir_borylation.csv")
smi_list = list(dict.fromkeys(data["smiles"].tolist()))
mol_list = list(zip(map(Chem.MolFromSmiles, smi_list), smi_list))

In [None]:
# Path handed to `transformer_ts1` as the transition-state guess structure.
ts_guess_struct = "../structures/ts1.xyz"

# How many ligands (taken from the front of `smi_list`) to build TS guesses for.
# Kept at 1 to match the previous `if i < 1` guard; set to None to process
# every ligand.
MAX_LIGANDS = 1

# Maps each TS name returned by `transformer_ts1` to (mol, idxs, ligand SMILES).
ts_dict = {}
for smi in smi_list[:MAX_LIGANDS]:
    ts_mol = transformer_ts1(
        ligand_smiles=smi,
        ts_guess_struct=ts_guess_struct,
    )

    # Append the source SMILES to every (mol, idxs) entry so it travels with
    # the structure through the later steps.
    ts_dict.update(
        {name: (mol, idxs, smi) for name, (mol, idxs) in ts_mol.items()}
    )

In [5]:
# Embed conformers for every TS entry, requesting 20 per entry with
# optimization switched on; the printed log reports how many were embedded.
embedded_dict = embed_ts(
    ts_dict,
    n_confs=20,
    optimize=True,
)

Embedded 11 conformers on atom 46
Embedded 13 conformers on atom 47
Embedded 9 conformers on atom 48
Embedded 9 conformers on atom 49


In [6]:
# Inspect the embedding result. Judging by the output below, each value looks
# like a tuple of (RWMol, conformer ids, constraint atom indices, ligand SMILES,
# [(energy, conformer id), ...]) — TODO confirm against `embed_ts`.
embedded_dict

{'TS(2-methyl-1,3-benzoxazole_rpos(4))': (<rdkit.Chem.rdchem.RWMol at 0x108d13790>,
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
  [10, 11, 39, 40, 41, 46],
  'Cc1nc2ccccc2o1',
  [(3717.674140137694, 0),
   (3808.626717257827, 1),
   (3907.9677158641243, 2),
   (3988.5288811022747, 3),
   (4070.6395598924505, 4),
   (3916.9678075304896, 5),
   (3666.9704845004126, 6),
   (4031.7244411412144, 7),
   (3522.8119199480543, 8),
   (4136.944773492277, 9),
   (3699.0583721824787, 10)]),
 'TS(2-methyl-1,3-benzoxazole_rpos(5))': (<rdkit.Chem.rdchem.RWMol at 0x108d11260>,
  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
  [10, 11, 39, 40, 41, 47],
  'Cc1nc2ccccc2o1',
  [(4454.781050986106, 0),
   (4446.031803292389, 1),
   (4132.893836756433, 2),
   (4418.65796869734, 3),
   (4049.6097848099903, 4),
   (4256.8914666191495, 5),
   (4093.5633867331735, 6),
   (3963.1186190344397, 7),
   (3997.0377984271, 8),
   (4183.595445706907, 9),
   (4158.469834191933, 10),
   (4099.84725267264, 11),
   (3997.407569485

In [None]:
# NOTE(review): `smi_list_filt` was referenced here but never defined anywhere
# in the notebook, so this cell failed with NameError on a fresh kernel.
# It is defined here as the SMILES from `mol_list` that RDKit could parse
# (`Chem.MolFromSmiles` returns None on failure) — confirm whether a stricter
# filter was intended.
smi_list_filt = [smi for mol, smi in mol_list if mol is not None]

# Create the calculation driver and build the starting table (one row per
# embedded conformer, judging by the frames displayed later) from the
# embedding result.
step = Stepper(smi_list_filt, save_output_dir=False, n_cores=10)
df0 = step.build_initial_df(embedded_dict)

In [None]:
# Render the conformer at positional row `loc` of df0 from its element list
# and its `coords_embedded` geometry.
loc = 3
atoms, coords = df0.iloc[loc][["atoms", "coords_embedded"]]
mol1 = ac2mol(atoms, coords)
MolTo3DGrid(
    mol1,
    cell_size=(500, 500),
    background_color="lightblue",
)

In [None]:
# Run xtb on every row of df0 with the "gfnff" and "opt" options and
# constraints enabled (presumably using the `constraint_atoms` column — verify
# in Stepper.xtb).
df1 = step.xtb(
    df0,
    options={"gfnff": None, "opt": None},
    constraint=True,
)

In [None]:
# Display the frame returned by the GFN-FF step; in the output below its new
# columns carry the `xtb-gfnff-opt-` prefix.
df1

Unnamed: 0,custom_name,ligand_name,rpos,constraint_atoms,cid,smiles,atoms,coords_embedded,energy_uff,xtb-gfnff-opt-electronic_energy,xtb-gfnff-opt-normal_termination,xtb-gfnff-opt-opt_coords
0,"TS(2-methyl-1,3-benzoxazole_rpos(4))","2-methyl-1,3-benzoxazole",4,"[10, 11, 39, 40, 41, 46]",0,Cc1nc2ccccc2o1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(1.9709266478557919, 0.26684339882517655, -0....",3717.674140,-10.006464,True,"[[1.85333701911721, 0.28387862760499, -0.61381..."
1,"TS(2-methyl-1,3-benzoxazole_rpos(4))","2-methyl-1,3-benzoxazole",4,"[10, 11, 39, 40, 41, 46]",1,Cc1nc2ccccc2o1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(0.5887990644779768, -1.2985535716782028, -1....",3808.626717,-10.006516,True,"[[0.51947122805717, -1.22753938534993, -1.4548..."
2,"TS(2-methyl-1,3-benzoxazole_rpos(4))","2-methyl-1,3-benzoxazole",4,"[10, 11, 39, 40, 41, 46]",2,Cc1nc2ccccc2o1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(0.1870214869465722, -1.3894754530032902, -1....",3907.967716,-10.004031,True,"[[0.29310822207179, -1.27726984538997, -1.3635..."
3,"TS(2-methyl-1,3-benzoxazole_rpos(4))","2-methyl-1,3-benzoxazole",4,"[10, 11, 39, 40, 41, 46]",3,Cc1nc2ccccc2o1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(0.2780697554797277, -1.8432762256872808, 0.0...",3988.528881,-10.000185,True,"[[0.26524192004831, -1.67313232349734, 0.12419..."
4,"TS(2-methyl-1,3-benzoxazole_rpos(4))","2-methyl-1,3-benzoxazole",4,"[10, 11, 39, 40, 41, 46]",4,Cc1nc2ccccc2o1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(-0.7421866376977833, -1.7295787217896785, 0....",4070.639560,-10.005630,True,"[[-0.62383984329356, -1.63699272816379, 0.1658..."
...,...,...,...,...,...,...,...,...,...,...,...,...
693,"TS(1H-pyrrolo[2,3-c]pyridine_rpos(7))","1H-pyrrolo[2,3-c]pyridine",7,"[10, 11, 39, 40, 41, 49]",3,c1cc2cc[nH]c2cn1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(1.959530183010633, 0.12127710177273465, 0.19...",3471.319375,-9.780451,True,"[[1.87115287030758, 0.14037011032758, 0.130666..."
694,"TS(1H-pyrrolo[2,3-c]pyridine_rpos(7))","1H-pyrrolo[2,3-c]pyridine",7,"[10, 11, 39, 40, 41, 49]",4,c1cc2cc[nH]c2cn1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(0.11077048527803753, -0.44798850348963376, 1...",3598.136303,-9.782176,True,"[[0.19356444038204, -0.35914141114205, 1.80098..."
695,"TS(1H-pyrrolo[2,3-c]pyridine_rpos(7))","1H-pyrrolo[2,3-c]pyridine",7,"[10, 11, 39, 40, 41, 49]",5,c1cc2cc[nH]c2cn1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(-1.4820208127190513, 0.6655098302461023, -0....",3598.862205,-9.781755,True,"[[-1.33122987044633, 0.62680646650485, -0.4515..."
696,"TS(1H-pyrrolo[2,3-c]pyridine_rpos(7))","1H-pyrrolo[2,3-c]pyridine",7,"[10, 11, 39, 40, 41, 49]",6,c1cc2cc[nH]c2cn1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(-1.6593570498557173, 0.10801577634388504, -0...",3798.803777,-9.781792,True,"[[-1.55678301683929, 0.13638859899777, -0.6265..."


In [None]:
# Render the conformer at positional row `loc` of df1 using the geometry
# stored in the `xtb-gfnff-opt-opt_coords` column.
loc = 3
atoms, coords = df1.iloc[loc][["atoms", "xtb-gfnff-opt-opt_coords"]]
mol1 = ac2mol(atoms, coords)
MolTo3DGrid(
    mol1,
    cell_size=(500, 500),
    background_color="lightblue",
)

In [None]:
# Run xtb on df1 with the "gfn" option set to 2 and no constraint flag; the
# resulting energy column is `xtb-gfn-electronic_energy` (sorted on in the
# next cell).
df2 = step.xtb(
    df1,
    options={"gfn": 2},
)

In [None]:
# Keep the two lowest-energy conformers for every (ligand_name, rpos) pair.
# Sorting is done in a chain rather than with `inplace=True`, so the frame
# object returned by `step.xtb` is never mutated and the cell can be re-run
# safely; `groupby(...).head(2)` keeps the first two rows of each group in the
# sorted order.
df2 = (
    df2
    .sort_values(by=["ligand_name", "rpos", "xtb-gfn-electronic_energy"])
    .groupby(["ligand_name", "rpos"])
    .head(2)
)

In [None]:
# Run xtb on the retained conformers with "gfn": 2 and "ohess" enabled, with
# constraints on; the output frame gains the `xtb-gfn-ohess-*` columns
# (energies, optimized coordinates, vibrations) shown further down.
df3 = step.xtb(
    df2,
    options={"gfn": 2, "ohess": True},
    constraint=True,
)

In [None]:
# Keep only the conformer with the lowest Gibbs energy for every
# (ligand_name, rpos) pair. A non-mutating chain replaces the earlier
# `inplace=True` sort, so the frame returned by `step.xtb` is left untouched
# and re-running the cell gives the same result.
df3 = (
    df3
    .sort_values(by=["ligand_name", "rpos", "xtb-gfn-ohess-gibbs_energy"])
    .groupby(["ligand_name", "rpos"])
    .head(1)
)

In [None]:
# Display df3, which after the selection above holds one row per
# (ligand_name, rpos) pair.
df3

Unnamed: 0,custom_name,ligand_name,rpos,constraint_atoms,cid,smiles,atoms,coords_embedded,energy_uff,xtb-gfnff-opt-electronic_energy,xtb-gfnff-opt-normal_termination,xtb-gfnff-opt-opt_coords,xtb-gfn-electronic_energy,xtb-gfn-normal_termination,xtb-gfn-ohess-electronic_energy,xtb-gfn-ohess-gibbs_energy,xtb-gfn-ohess-normal_termination,xtb-gfn-ohess-opt_coords,xtb-gfn-ohess-vibs
487,TS(1H-pyrazole_rpos(0)),1H-pyrazole,0,"[10, 11, 39, 40, 41, 42]",0,c1cn[nH]c1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(-0.3363545364787292, -1.0986611440262066, -1...",4925.078159,-8.362270,True,"[[-0.24124289308396, -1.07139293151168, -1.072...",-62.767956,True,-62.787559,-62.210217,True,"[[-0.27574711400802, -1.07439859997449, -1.059...","[{'frequency': 36.47}, {'frequency': 69.37}, {..."
496,TS(1H-pyrazole_rpos(1)),1H-pyrazole,1,"[10, 11, 39, 40, 41, 43]",5,c1cn[nH]c1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(-0.07268044199739145, -0.7046850873860968, -...",5127.584330,-8.375168,True,"[[-0.12181471088119, -0.64395210531125, -1.484...",-62.769640,True,-62.787099,-62.208686,True,"[[-0.09651007538632, -0.63620915917576, -1.465...","[{'frequency': 34.19}, {'frequency': 69.21}, {..."
500,TS(1H-pyrazole_rpos(4)),1H-pyrazole,4,"[10, 11, 39, 40, 41, 46]",2,c1cn[nH]c1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(0.31127341899364624, -1.036439865596074, -1....",4443.780734,-8.378933,True,"[[0.26805450731156, -1.0423776724668, -0.95141...",-62.775276,True,-62.791439,-62.212365,True,"[[0.29637036339281, -1.00972519606813, -0.9574...","[{'frequency': 28.08}, {'frequency': 56.53}, {..."
560,"TS(1H-pyrrolo[2,3-b]pyridine_rpos(0))","1H-pyrrolo[2,3-b]pyridine",0,"[10, 11, 39, 40, 41, 42]",6,c1cnc2[nH]ccc2c1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(1.9541912424652044, -0.07991233669673901, -0...",4162.386190,-9.784867,True,"[[1.85488983245536, -0.09050824433637, -0.6346...",-72.377479,True,-72.396451,-71.775053,True,"[[1.86232243373825, -0.05304071834412, -0.6453...","[{'frequency': 26.11}, {'frequency': 35.16}, {..."
567,"TS(1H-pyrrolo[2,3-b]pyridine_rpos(1))","1H-pyrrolo[2,3-b]pyridine",1,"[10, 11, 39, 40, 41, 43]",2,c1cnc2[nH]ccc2c1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(-1.7751969578012101, -0.08884308987868071, -...",3620.607028,-9.794048,True,"[[-1.70983036380327, -0.03438857828162, -1.253...",-72.375804,True,-72.397509,-71.775539,True,"[[-1.67678733106929, -0.05697670451432, -1.304...","[{'frequency': 28.19}, {'frequency': 41.58}, {..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,TS(6-fluoro-2-methyl-1H-benzimidazole_rpos(4)),6-fluoro-2-methyl-1H-benzimidazole,4,"[10, 11, 39, 40, 41, 46]",15,Cc1nc2cc(F)ccc2[nH]1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(-1.5774383651890003, 0.2052155652498493, -1....",3721.049363,-10.219770,True,"[[-1.50010355182853, 0.14697210143728, -1.4219...",-79.786657,True,-79.808600,-79.171889,True,"[[-1.48178880456004, 0.17067193730003, -1.3426...","[{'frequency': -6.52}, {'frequency': 50.63}, {..."
476,TS(6-fluoro-2-methyl-1H-benzimidazole_rpos(7)),6-fluoro-2-methyl-1H-benzimidazole,7,"[10, 11, 39, 40, 41, 49]",0,Cc1nc2cc(F)ccc2[nH]1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(-1.8902229584494086, 0.6697723694736558, 1.1...",3882.908578,-10.224738,True,"[[-1.76752664707509, 0.61319254884894, 1.11311...",-79.784262,True,-79.805303,-79.166951,True,"[[-1.80778298314931, 0.55636087355551, 1.16755...","[{'frequency': 21.81}, {'frequency': 33.48}, {..."
449,TS(6-fluoro-2-methyl-1H-benzimidazole_rpos(8)),6-fluoro-2-methyl-1H-benzimidazole,8,"[10, 11, 39, 40, 41, 50]",0,Cc1nc2cc(F)ccc2[nH]1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(-1.2861288399068997, 1.299666918181471, 0.07...",4098.612959,-10.228749,True,"[[-1.17125282980342, 1.24546189357644, 0.05636...",-79.792836,True,-79.810558,-79.172063,True,"[[-1.22482520308521, 1.22905689609943, 0.07218...","[{'frequency': 19.74}, {'frequency': 48.73}, {..."
381,TS(pyrimidine-2-carbonitrile_rpos(4)),pyrimidine-2-carbonitrile,4,"[10, 11, 39, 40, 41, 46]",2,N#Cc1ncccn1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(0.5187254821651651, 1.5455826902746428, -0.7...",4100.925317,-9.141678,True,"[[0.63811470881203, 1.45636415475136, -0.74604...",-69.416852,True,-69.439257,-68.860687,True,"[[0.59463209862313, 1.45356187187476, -0.75881...","[{'frequency': 19.78}, {'frequency': 49.61}, {..."


In [None]:
# Render the structure at positional row `loc` of df3.
# The geometry is taken from `xtb-gfn-ohess-opt_coords`, i.e. the coordinates
# produced by the ohess step that created df3. This cell previously plotted
# `xtb-gfnff-opt-opt_coords`, the older force-field geometry, which looks like
# a copy-paste leftover from the df1 viewer.
loc = 0
atoms = df3["atoms"].iloc[loc]
coords = df3["xtb-gfn-ohess-opt_coords"].iloc[loc]
mol1 = ac2mol(atoms, coords)
MolTo3DGrid(mol1, cell_size=(500, 500), background_color="lightblue")

In [None]:
# Display df3 once more before export; the viewer cell above does not modify
# it, so this is the same table as shown earlier.
df3

Unnamed: 0,custom_name,ligand_name,rpos,constraint_atoms,cid,smiles,atoms,coords_embedded,energy_uff,xtb-gfnff-opt-electronic_energy,xtb-gfnff-opt-normal_termination,xtb-gfnff-opt-opt_coords,xtb-gfn-electronic_energy,xtb-gfn-normal_termination,xtb-gfn-ohess-electronic_energy,xtb-gfn-ohess-gibbs_energy,xtb-gfn-ohess-normal_termination,xtb-gfn-ohess-opt_coords,xtb-gfn-ohess-vibs
487,TS(1H-pyrazole_rpos(0)),1H-pyrazole,0,"[10, 11, 39, 40, 41, 42]",0,c1cn[nH]c1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(-0.3363545364787292, -1.0986611440262066, -1...",4925.078159,-8.362270,True,"[[-0.24124289308396, -1.07139293151168, -1.072...",-62.767956,True,-62.787559,-62.210217,True,"[[-0.27574711400802, -1.07439859997449, -1.059...","[{'frequency': 36.47}, {'frequency': 69.37}, {..."
496,TS(1H-pyrazole_rpos(1)),1H-pyrazole,1,"[10, 11, 39, 40, 41, 43]",5,c1cn[nH]c1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(-0.07268044199739145, -0.7046850873860968, -...",5127.584330,-8.375168,True,"[[-0.12181471088119, -0.64395210531125, -1.484...",-62.769640,True,-62.787099,-62.208686,True,"[[-0.09651007538632, -0.63620915917576, -1.465...","[{'frequency': 34.19}, {'frequency': 69.21}, {..."
500,TS(1H-pyrazole_rpos(4)),1H-pyrazole,4,"[10, 11, 39, 40, 41, 46]",2,c1cn[nH]c1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(0.31127341899364624, -1.036439865596074, -1....",4443.780734,-8.378933,True,"[[0.26805450731156, -1.0423776724668, -0.95141...",-62.775276,True,-62.791439,-62.212365,True,"[[0.29637036339281, -1.00972519606813, -0.9574...","[{'frequency': 28.08}, {'frequency': 56.53}, {..."
560,"TS(1H-pyrrolo[2,3-b]pyridine_rpos(0))","1H-pyrrolo[2,3-b]pyridine",0,"[10, 11, 39, 40, 41, 42]",6,c1cnc2[nH]ccc2c1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(1.9541912424652044, -0.07991233669673901, -0...",4162.386190,-9.784867,True,"[[1.85488983245536, -0.09050824433637, -0.6346...",-72.377479,True,-72.396451,-71.775053,True,"[[1.86232243373825, -0.05304071834412, -0.6453...","[{'frequency': 26.11}, {'frequency': 35.16}, {..."
567,"TS(1H-pyrrolo[2,3-b]pyridine_rpos(1))","1H-pyrrolo[2,3-b]pyridine",1,"[10, 11, 39, 40, 41, 43]",2,c1cnc2[nH]ccc2c1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(-1.7751969578012101, -0.08884308987868071, -...",3620.607028,-9.794048,True,"[[-1.70983036380327, -0.03438857828162, -1.253...",-72.375804,True,-72.397509,-71.775539,True,"[[-1.67678733106929, -0.05697670451432, -1.304...","[{'frequency': 28.19}, {'frequency': 41.58}, {..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,TS(6-fluoro-2-methyl-1H-benzimidazole_rpos(4)),6-fluoro-2-methyl-1H-benzimidazole,4,"[10, 11, 39, 40, 41, 46]",15,Cc1nc2cc(F)ccc2[nH]1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(-1.5774383651890003, 0.2052155652498493, -1....",3721.049363,-10.219770,True,"[[-1.50010355182853, 0.14697210143728, -1.4219...",-79.786657,True,-79.808600,-79.171889,True,"[[-1.48178880456004, 0.17067193730003, -1.3426...","[{'frequency': -6.52}, {'frequency': 50.63}, {..."
476,TS(6-fluoro-2-methyl-1H-benzimidazole_rpos(7)),6-fluoro-2-methyl-1H-benzimidazole,7,"[10, 11, 39, 40, 41, 49]",0,Cc1nc2cc(F)ccc2[nH]1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(-1.8902229584494086, 0.6697723694736558, 1.1...",3882.908578,-10.224738,True,"[[-1.76752664707509, 0.61319254884894, 1.11311...",-79.784262,True,-79.805303,-79.166951,True,"[[-1.80778298314931, 0.55636087355551, 1.16755...","[{'frequency': 21.81}, {'frequency': 33.48}, {..."
449,TS(6-fluoro-2-methyl-1H-benzimidazole_rpos(8)),6-fluoro-2-methyl-1H-benzimidazole,8,"[10, 11, 39, 40, 41, 50]",0,Cc1nc2cc(F)ccc2[nH]1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(-1.2861288399068997, 1.299666918181471, 0.07...",4098.612959,-10.228749,True,"[[-1.17125282980342, 1.24546189357644, 0.05636...",-79.792836,True,-79.810558,-79.172063,True,"[[-1.22482520308521, 1.22905689609943, 0.07218...","[{'frequency': 19.74}, {'frequency': 48.73}, {..."
381,TS(pyrimidine-2-carbonitrile_rpos(4)),pyrimidine-2-carbonitrile,4,"[10, 11, 39, 40, 41, 46]",2,N#Cc1ncccn1,"[C, C, C, C, C, C, H, H, H, H, B, N, C, C, C, ...","[(0.5187254821651651, 1.5455826902746428, -0.7...",4100.925317,-9.141678,True,"[[0.63811470881203, 1.45636415475136, -0.74604...",-69.416852,True,-69.439257,-68.860687,True,"[[0.59463209862313, 1.45356187187476, -0.75881...","[{'frequency': 19.78}, {'frequency': 49.61}, {..."


In [None]:
# Write the final table to a Parquet file in the current working directory.
df3.to_parquet(
    path="ir_ts_data.parquet",
)