In [2]:
import os
from rdkit import Chem
from rdkit import RDLogger
import re
import json
RDLogger.DisableLog('rdApp.*')


# Maps a functional-group name to the list of SMILES/SMARTS patterns for it.
fun_group_smiles_dictionary = {}

# Directory that is scanned for functional-group mol files.
path = "Functional_Groups_Mol"
# os.listdir returns entries in arbitrary order; sorting keeps the dictionary
# (and anything written from it) in the same order on every run and machine.
all_molfiles_list = sorted(os.listdir(path))

##########################################################################################
# Hand-written SMARTS patterns for the groups that are registered explicitly
# here (aromatics, hydroxyl and carboxylic acid).

# An aromatic carbon: either a neutral three-connected one or a cationic
# two-connected one, in both cases with two aromatic bonds to neighbours.
aromatic_smarts = "[$([cX3](:*):*),$([cX2+](:*):*)]"
# An aromatic nitrogen with two aromatic bonds to neighbours.
aromatic_nitrogen_smarts = "[$([n](:*):*)]"
# NOTE(review): this pattern puts one aliphatic carbon between the aromatic atom
# and the carbonyl carbon (aryl-C-C=O) -- confirm that is what is intended.
aromatic_ketone_smarts = "[$([cX3](:*):*),$([cX2+](:*):*)]CC(=O)"
# A carbon atom bonded to an -OH oxygen.
hydroxyl_smarts = "[#6][OX2H]"
carboxylic_acid_smarts = "[CX3](=O)[OX2H1]"
halogens = ["F", "Cl", "Br", "I"]

# One pattern per halogen: the aromatic carbon pattern followed by the halogen symbol.
aromatic_halogen_smarts = [aromatic_smarts + symbol for symbol in halogens]

# Register everything in one go; insertion order matches the key order used before.
fun_group_smiles_dictionary.update(
    {
        "Aromatic": [aromatic_smarts],
        "Aromatic nitrogen": [aromatic_nitrogen_smarts],
        "Aromatic halogen": aromatic_halogen_smarts,
        "Aromatic ether": [aromatic_smarts + "OC"],
        "Aromatic ketone": [aromatic_ketone_smarts],
        "Hydroxyl": [hydroxyl_smarts],
        "Carboxylic acid": [carboxylic_acid_smarts],
    }
)
##########################################################################################


# Convert every .mol file in the directory into one or more SMILES strings and
# store them under the file's base name.
for fun_group_file in all_molfiles_list:
    # The directory listing can contain entries that are not mol files
    # (e.g. .DS_Store or .ipynb_checkpoints); skip them instead of crashing.
    fun_group_name, extension = os.path.splitext(fun_group_file)
    if extension.lower() != ".mol":
        continue

    fun_group_path = os.path.join(path, fun_group_file)
    fun_group = Chem.MolFromMolFile(fun_group_path)
    # RDKit signals an unreadable mol file by returning None, not by raising.
    if fun_group is None:
        raise ValueError("Could not parse mol file: " + fun_group_path)
    fun_group_smiles = Chem.MolToSmiles(fun_group)

    # Each .mol file can hold several alternative substructures, which show up
    # as "."-separated fragments in the SMILES. split() returns a one-element
    # list when there is no ".", so both cases are handled by the same call.
    fun_group_smiles_list = fun_group_smiles.split(".")

    fun_group_smiles_dictionary[fun_group_name] = fun_group_smiles_list

    
# Serialise the name -> patterns mapping as JSON and write it to disk.
# A fixed indent is used: the previous indent=len(dictionary) made the
# indentation width grow with the number of functional groups.
json_object = json.dumps(fun_group_smiles_dictionary, indent=4)
with open("The_machine_proper/SMILES_SMARTS_string.json", "w") as doc:
    doc.write(json_object)
# Bare last expression so the notebook displays the resulting dictionary.
fun_group_smiles_dictionary

{'Aromatic': ['[$([cX3](:*):*),$([cX2+](:*):*)]'],
 'Aromatic nitrogen': ['[$([n](:*):*)]'],
 'Aromatic halogen': ['[$([cX3](:*):*),$([cX2+](:*):*)]F',
  '[$([cX3](:*):*),$([cX2+](:*):*)]Cl',
  '[$([cX3](:*):*),$([cX2+](:*):*)]Br',
  '[$([cX3](:*):*),$([cX2+](:*):*)]I'],
 'Aromatic ether': ['[$([cX3](:*):*),$([cX2+](:*):*)]OC'],
 'Aromatic ketone': ['[$([cX3](:*):*),$([cX2+](:*):*)]CC(=O)'],
 'Hydroxyl': ['[#6][OX2H]'],
 'Carboxylic acid': ['[CX3](=O)[OX2H1]'],
 'Acyl Halide': ['O=CBr', 'O=CCl', 'O=CF', 'O=CI'],
 'Alkane': ['CC'],
 'Alkene': ['C=C'],
 'Alkyne': ['C#C'],
 'Amide': ['NC=O'],
 'Carbonyl': ['C=O'],
 'Ester': ['O=CO'],
 'Imine': ['C=N'],
 'Nitrile': ['C#N'],
 'Nitro': ['N=O'],
 'Non-aromatic Ether': ['COC'],
 'Non-aromatic Halide': ['CBr', 'CCl', 'CF', 'CI'],
 'Non-aromatic Ketone': ['CC(C)=O'],
 'Sulphur oxide': ['O=S'],
 'Sulphur': ['S']}

In [3]:
# Testing: check whether a molecule contains any of the given functional-group
# patterns and print the verdict.

molecule_smile_string = "c1ccccc1"
fun_group_smiles_list = ['[$([cX3](:*):*),$([cX2+](:*):*)]']

# Parse the molecule once; it does not change between patterns.
molecule = Chem.MolFromSmiles(molecule_smile_string)
if molecule is None:
    raise ValueError("Could not parse molecule SMILES: " + molecule_smile_string)

matched = False
for functional_group_smile_string in fun_group_smiles_list:
    # RDKit parsers return None on failure instead of raising, so the fallback
    # from SMARTS to SMILES has to be triggered by a None check.
    functional_group = Chem.MolFromSmarts(functional_group_smile_string)
    if functional_group is None:
        functional_group = Chem.MolFromSmiles(functional_group_smile_string)
    if functional_group is None:
        raise ValueError(
            "Could not parse functional group pattern: " + functional_group_smile_string
        )

    if molecule.HasSubstructMatch(functional_group):
        matched = True
        # One hit is enough; the remaining patterns cannot change the result.
        break

if matched:
    print("matches")
else:
    print("doesn't match")

matches
