# Finding MCS(Maximum Common Substrucutres)

## Load Module & Finding MCS algorithm

In [14]:
import pandas as pd
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import AllChem
from collections import defaultdict
from rdkit.Chem import Draw

# Read the Excel file
comp_dat = pd.read_excel('Compound List(DNA Damage).xlsx')

# Get the SMILES list
smiles_list = comp_dat['CPD_SMILES']

# Convert SMILES to RDKit molecular objects
molecules = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]

# Function to generate ECFP fingerprints and count substructures
def count_substructures(molecules, radius=2):
    substructure_counts = defaultdict(int)
    bit_info = defaultdict(list)
    for mol in molecules:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=2048, bitInfo=bit_info)
        for bit_id in bit_info:
            substructure_counts[bit_id] += 1
    return substructure_counts, bit_info

# Count the substructures in the molecules
substructure_counts, bit_info = count_substructures(molecules)

# Sort substructures by frequency
sorted_substructures = sorted(substructure_counts.items(), key=lambda x: x[1], reverse=True)

# Get the top N substructures
N = 5
top_substructures = sorted_substructures[:N]

# Function to convert a bit ID to a SMARTS pattern
def bit_id_to_smarts(molecules, bit_id, radius=2):
    for mol in molecules:
        if bit_id in bit_info:
            atom_ids = bit_info[bit_id][0][0]
            env = Chem.FindAtomEnvironmentOfRadiusN(mol, radius, atom_ids)
            amap = {}
            submol = Chem.PathToSubmol(mol, env, atomMap=amap)
            return Chem.MolToSmarts(submol)
    return None

# Visualize the top N substructures
for bit_id, count in top_substructures:
    try:
        smarts = bit_id_to_smarts(molecules, bit_id)
        if smarts:
            submol = Chem.MolFromSmarts(smarts)
            print(f'Substructure (Bit ID: {bit_id}, Count: {count}, SMARTS: {smarts})')
            img = Draw.MolToImage(submol)

            fig, ax = plt.subplots()
            ax.imshow(img)
            ax.axis('off')
            plt.savefig(f'{bit_id}.png', dpi=300)
            plt.close(fig)
        else:
            print(f"SMARTS not found for bit_id {bit_id}")
    except Exception as e:
        print(f"Error with bit_id {bit_id}: {e}")




Substructure (Bit ID: 1380, Count: 101, SMARTS: [#6]-[#8]-[#6@@H](-[#6]-[#6])-[#6@](-[#6])(-[#6])-[#6@@H])
Substructure (Bit ID: 1873, Count: 101, SMARTS: [#8]-[#6@@H](-[#6]-[#6]-[#6@@H])-[#6@])
Substructure (Bit ID: 1057, Count: 95, SMARTS: [#6]-[#6]-[#6])
Substructure (Bit ID: 1750, Count: 94, SMARTS: [#6]-[#6]-[#6@@H](-[#6@H](-[#6])-[#6@@H])-[#6@@](-[#6])(-[#6])-[#6@@H])
Substructure (Bit ID: 807, Count: 93, SMARTS: [#6](-[#6@H](-[#6@])-[#6@H])-[#6]-[#6@])
