In [34]:
from rdkit import Chem
from rdkit.Chem import rdFMCS
import pandas as pd

In [35]:
def get_MCS_sim(mol1: Chem.Mol, mol2: Chem.Mol, timeout: int = 1) -> rdFMCS.MCSResult:
    """Find the maximum common substructure (MCS) shared by two molecules.

    Despite the name, this function does not return a similarity score: it
    returns the raw RDKit MCS result, and the caller derives a score from it
    (for example from ``result.numAtoms``).

    The search requires matching valences and exact bond orders, and it
    ignores chiral tags.

    Args:
        mol1: First RDKit molecule.
        mol2: Second RDKit molecule.
        timeout: Time budget, in seconds, for the MCS search. If the budget
            is exhausted RDKit returns the best match found so far and sets
            ``result.canceled`` to True.

    Returns:
        The ``rdFMCS.MCSResult`` produced by ``rdFMCS.FindMCS``.
    """
    return rdFMCS.FindMCS(
        [mol1, mol2],
        timeout=timeout,
        matchValences=True,
        matchChiralTag=False,
        bondCompare=rdFMCS.BondCompare.CompareOrderExact,
    )

In [36]:
# Read the enzyme spreadsheet and keep its substrate SMILES column as a plain list.
enzymes_df = pd.read_excel("rule0024_52_enzymes.xlsx")
native_substrates = enzymes_df["Substrate SMILES"].tolist()

In [37]:
ketohexanoic_acid_smiles = "CC(=O)CCCC(=O)O"

# Parse the query once: it is identical for every comparison, so there is no
# reason to rebuild it inside the loop.
mol1 = Chem.MolFromSmiles(ketohexanoic_acid_smiles)
if mol1 is None:
    # MolFromSmiles signals a parse failure by returning None, not by raising.
    # Without a query every score would be a meaningless 0, so fail loudly.
    raise ValueError(f"Could not parse query SMILES: {ketohexanoic_acid_smiles!r}")
query_num_atoms = mol1.GetNumAtoms()

sim_scores = []

for substrate_smiles in native_substrates:
    # Missing cells (non-string values such as NaN), SMILES that RDKit cannot
    # parse, and empty molecules all score 0. These cases are checked
    # explicitly so that genuinely unexpected errors are no longer swallowed.
    mol2 = (
        Chem.MolFromSmiles(substrate_smiles)
        if isinstance(substrate_smiles, str)
        else None
    )
    if mol2 is None or mol2.GetNumAtoms() == 0:
        sim_scores.append(0)
        continue

    MCS_result = get_MCS_sim(mol1=mol1, mol2=mol2)

    # Tanimoto-style atom overlap: shared atoms / atoms in the union.
    score = MCS_result.numAtoms / (
        query_num_atoms + mol2.GetNumAtoms() - MCS_result.numAtoms
    )
    sim_scores.append(score)

enzymes_df["MCS score"] = sim_scores

In [38]:
# Show the enzymes ranked from highest to lowest MCS score.
enzymes_df.sort_values("MCS score", ascending=False)

Unnamed: 0,UNIPROT ID,Substrate CHEBI ID,Substrate SMILES,MCS score
18,P23670,CHEBI:13705,CC(=O)CC(=O)[O-],0.454545
23,P56744,CHEBI:58761,C(C[NH3+])[C@@H](C(=O)[O-])[NH3+],0.416667
36,Q9WYS8,CHEBI:137981,C1=C([N+](=CN1)[C@H]2[C@@H]([C@@H]([C@H](O2)CO...,0.272727
3,O66608,CHEBI:137981,C1=C([N+](=CN1)[C@H]2[C@@H]([C@@H]([C@H](O2)CO...,0.272727
20,P43850,CHEBI:137981,C1=C([N+](=CN1)[C@H]2[C@@H]([C@@H]([C@H](O2)CO...,0.272727
34,Q9RYC6,CHEBI:137981,C1=C([N+](=CN1)[C@H]2[C@@H]([C@@H]([C@H](O2)CO...,0.272727
13,P12045,CHEBI:137981,C1=C([N+](=CN1)[C@H]2[C@@H]([C@@H]([C@H](O2)CO...,0.272727
37,Q9XAY8,CHEBI:137981,C1=C([N+](=CN1)[C@H]2[C@@H]([C@@H]([C@H](O2)CO...,0.272727
24,P74724,CHEBI:137981,C1=C([N+](=CN1)[C@H]2[C@@H]([C@@H]([C@H](O2)CO...,0.272727
22,P55195,CHEBI:77657,C1=NC(=C(N1[C@H]2[C@@H]([C@@H]([C@H](O2)COP(=O...,0.24


In [47]:
# Number of distinct substrate SMILES strings in the list.
len({*native_substrates})

8

In [46]:
# Select the row(s) whose substrate SMILES exactly equals the string below.
enzymes_df.loc[
    enzymes_df["Substrate SMILES"].eq(
        "CCC1=C(C2=NC1=CC3=C(C4=C(N3)C(=C5[C@H]([C@@H](C(=CC6=NC(=C2)C(=C6C)C=C)N5)C)CCC(=O)[O-])C(=C4[O-])C(=O)OC)C)C"
    )
]

Unnamed: 0,UNIPROT ID,Substrate CHEBI ID,Substrate SMILES,MCS score
27,Q2V0W1,CHEBI:58687,CCC1=C(C2=NC1=CC3=C(C4=C(N3)C(=C5[C@H]([C@@H](...,0.152174
