In [2]:
import os
from rdkit import Chem
from rdkit.Chem import SDWriter

In [4]:
# Folder that is scanned for input .sdf files
directory = 'path/toInput/Directory'  # Replace with the correct path

# Bookkeeping, both keyed by canonical SMILES:
#   unique_molecules    -> the first molecule seen for that SMILES
#   duplicate_molecules -> list of (filename, molecule) for every repeat
unique_molecules, duplicate_molecules = {}, {}

In [6]:
# Output file paths. They are defined before the scan so that SDF files
# produced by a previous run of this notebook can be excluded from the input
# set (they live in the same directory and also end in ".sdf").
output_sdf = os.path.join(directory, 'unique_molecules.sdf')
duplicates_txt = os.path.join(directory, 'duplicates_molecules.txt')
duplicates_sdf = os.path.join(directory, 'duplicates_molecules.sdf')
generated_sdf_names = {os.path.basename(output_sdf), os.path.basename(duplicates_sdf)}

# Start from empty results: the dictionaries are created in an earlier cell,
# so without this a re-run of this cell would flag every molecule as a
# duplicate of the copy kept from the previous run.
unique_molecules.clear()
duplicate_molecules.clear()

# Iterate over all SDF files in the directory. os.listdir() returns entries in
# arbitrary order, so sort them to make "first seen" (the copy kept as unique)
# reproducible between runs.
for filename in sorted(os.listdir(directory)):
    # Accept ".sdf" in any letter case, but never re-ingest our own outputs
    if not filename.lower().endswith(".sdf") or filename in generated_sdf_names:
        continue
    file_path = os.path.join(directory, filename)

    # Read the SDF file; the supplier may yield None entries, skipped below
    supplier = Chem.SDMolSupplier(file_path)

    # Iterate over all molecules in the SDF file
    for mol in supplier:
        if mol is None:
            continue

        # Canonical SMILES is the identity key used for de-duplication
        canonical_smiles = Chem.MolToSmiles(mol, canonical=True)

        # Tag every molecule (unique or duplicate) with its source file so
        # both output SDFs carry provenance
        mol.SetProp("FileName", filename)

        if canonical_smiles in unique_molecules:
            # Already seen: record this occurrence as a duplicate
            duplicate_molecules.setdefault(canonical_smiles, []).append((filename, mol))
        else:
            # First occurrence: keep it as the unique representative
            unique_molecules[canonical_smiles] = mol

# Dump each first-seen structure into one combined SDF file
with SDWriter(output_sdf) as unique_writer:
    for unique_mol in unique_molecules.values():
        unique_writer.write(unique_mol)

# Write duplicate molecules and their file names to the respective files.
# The text report is opened as UTF-8 explicitly so that file names containing
# non-ASCII characters do not depend on the platform's default encoding.
with open(duplicates_txt, 'w', encoding='utf-8') as txt_writer, SDWriter(duplicates_sdf) as sdf_writer:
    if duplicate_molecules:
        txt_writer.write("Duplicate molecules found:\n")
        for smiles, duplicates in duplicate_molecules.items():
            txt_writer.write(f"Canonical SMILES: {smiles}\n")
            # Name the file holding the copy that was kept as unique, so the
            # report says what each duplicate is a duplicate *of*
            kept = unique_molecules.get(smiles)
            if kept is not None and kept.HasProp("FileName"):
                txt_writer.write(f" - Retained copy from file: {kept.GetProp('FileName')}\n")
            for filename, mol in duplicates:
                txt_writer.write(f" - Found in file: {filename}\n")
                # Write the duplicate molecule to the SDF file
                sdf_writer.write(mol)
    else:
        # The duplicates SDF is still created (empty) in this case
        txt_writer.write("No duplicate molecules found.\n")

# Tell the user where each of the three output files was written
print(
    f"Unique molecules SDF file saved as {output_sdf}",
    f"Duplicate molecules list saved as {duplicates_txt}",
    f"Duplicate molecules SDF file saved as {duplicates_sdf}",
    sep="\n",
)


FileNotFoundError: [Errno 2] No such file or directory: 'path/toInput/Directory'