In [3]:
import os
import csv
import shutil
from rdkit import Chem

def copy_pdb_files(source_folder, destination_folder):
    """Copy every ``*_protein.pdb`` file under *source_folder* into one flat folder.

    The tree rooted at *source_folder* is walked recursively. Each file whose
    name ends with ``_protein.pdb`` is copied (with metadata, via
    ``shutil.copy2``) directly into *destination_folder*, with the trailing
    ``_protein`` tag removed from its name, e.g.
    ``6TW5_9M2_protein.pdb`` -> ``6TW5_9M2.pdb``.

    Args:
        source_folder: Root directory to search.
        destination_folder: Directory that receives the renamed copies; it is
            created if it does not exist.
    """
    suffix = '_protein.pdb'
    os.makedirs(destination_folder, exist_ok=True)
    for root, _dirs, files in os.walk(source_folder):
        for file in files:
            if not file.endswith(suffix):
                continue
            # Strip the tag from the file name only. Running str.replace on
            # the joined path would also rewrite a destination folder whose
            # own name happens to contain "_protein".
            target_name = file[:-len(suffix)] + '.pdb'
            source_path = os.path.join(root, file)
            destination_path = os.path.join(destination_folder, target_name)
            shutil.copy2(source_path, destination_path)
            print(f"Copied: {target_name}")
    print(f"All PDB files have been copied to {destination_folder}")

def copy_ligand_files(source_folder, destination_folder):
    """Copy ligand structure files, keeping one sub-folder per source directory.

    The tree rooted at *source_folder* is walked recursively. Each file whose
    name ends with ``_ligand.sdf`` or ``_ligand.mol2`` is copied (with
    metadata, via ``shutil.copy2``) to
    ``<destination_folder>/<name of the file's parent directory>/<file>``.

    Args:
        source_folder: Root directory to search.
        destination_folder: Root of the output tree; it and the per-entry
            sub-folders are created as needed.
    """
    ligand_suffixes = ('_ligand.sdf', '_ligand.mol2')
    os.makedirs(destination_folder, exist_ok=True)
    for root, _dirs, files in os.walk(source_folder):
        for file in files:
            if not file.endswith(ligand_suffixes):
                continue
            source_path = os.path.join(root, file)
            # Take the parent directory name through os.path rather than
            # splitting on '/', so the result does not depend on the platform
            # separator or on redundant/trailing separators in the input.
            parent_name = os.path.basename(os.path.dirname(source_path))
            destination_dir = os.path.join(destination_folder, parent_name)
            os.makedirs(destination_dir, exist_ok=True)
            destination_path = os.path.join(destination_dir, file)
            shutil.copy2(source_path, destination_path)
            print(f"Copied: {file}")
    print(f"All SDF/MOL2 files have been copied to {destination_folder}")

def generate_csv(input_folder, output_file):
    """Write a CSV of ligand SMILES, one row per sub-folder of *input_folder*.

    For every sub-directory ``<pdb_id>`` of *input_folder*, the file
    ``<pdb_id>/<pdb_id>_ligand.sdf`` is read with RDKit and the first molecule
    is converted to SMILES. The header is
    ``Cleaned_SMILES, pdb_id, ligand_id``; the sub-folder name is written to
    both the ``pdb_id`` and ``ligand_id`` columns. Entries whose SDF is missing
    or cannot be parsed are reported with a warning and skipped.

    Args:
        input_folder: Directory containing one sub-folder per entry.
        output_file: Path of the CSV file to create (overwritten if present).
    """
    with open(output_file, 'w', newline='') as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow(['Cleaned_SMILES', 'pdb_id', 'ligand_id'])
        # os.listdir returns entries in arbitrary order; sort so the CSV is
        # reproducible from run to run.
        for pdb_id in sorted(os.listdir(input_folder)):
            subfolder_path = os.path.join(input_folder, pdb_id)
            if not os.path.isdir(subfolder_path):
                continue
            ligand_file = os.path.join(subfolder_path, f'{pdb_id}_ligand.sdf')
            if not os.path.exists(ligand_file):
                print(f"Warning: ligand.sdf not found in {subfolder_path}")
                continue
            try:
                mol = Chem.SDMolSupplier(ligand_file)[0]
            except IndexError:
                # An SDF with no records has no element 0; treat it like an
                # unparsable molecule instead of aborting the whole run.
                mol = None
            if mol is None:
                print(f"Warning: Could not read molecule from {ligand_file}")
                continue
            smiles = Chem.MolToSmiles(mol)
            csvwriter.writerow([smiles, pdb_id, pdb_id])
    print(f"CSV file has been generated: {output_file}")

# Example usage: build the pdb_files / gt_mol_files / ligands.csv layout
# from a PoseBusters benchmark checkout. Adjust the paths below as needed.
source_folder = '/mnt/data/posebusters/posebusters_benchmark_set'           # original benchmark set (input)
destination_folder_pdb = '/mnt/data/posebusters_fabind/pdb_files'           # output: flat folder of renamed protein PDB files
destination_folder_ligand = '/mnt/data/posebusters_fabind/gt_mol_files'     # output: one sub-folder of ligand files per entry
output_csv_file = '/mnt/data/posebusters_fabind/ligands.csv'                # output: CSV of ligand SMILES

# The ligand copy must run before generate_csv, which reads the copied
# ligand folder rather than the original benchmark set.
copy_ligand_files(source_folder, destination_folder_ligand)
copy_pdb_files(source_folder, destination_folder_pdb)
generate_csv(destination_folder_ligand,output_csv_file)

Copied: 6TW5_9M2_ligand.sdf
Copied: 6TW5_9M2_ligand.mol2
Copied: 8E77_ULP_ligand.mol2
Copied: 8E77_ULP_ligand.sdf
Copied: 7U3J_L6U_ligand.sdf
Copied: 7U3J_L6U_ligand.mol2
Copied: 7M3H_YPV_ligand.sdf
Copied: 7M3H_YPV_ligand.mol2
Copied: 8AEM_LVF_ligand.sdf
Copied: 8AEM_LVF_ligand.mol2
Copied: 7JNB_A2G_ligand.sdf
Copied: 7JNB_A2G_ligand.mol2
Copied: 7AKL_RK5_ligand.sdf
Copied: 7AKL_RK5_ligand.mol2
Copied: 7OKF_VH5_ligand.sdf
Copied: 7OKF_VH5_ligand.mol2
Copied: 7BCP_GCO_ligand.sdf
Copied: 7BCP_GCO_ligand.mol2
Copied: 7WY1_D0L_ligand.sdf
Copied: 7WY1_D0L_ligand.mol2
Copied: 7Q27_8KC_ligand.sdf
Copied: 7Q27_8KC_ligand.mol2
Copied: 8AIE_M7L_ligand.sdf
Copied: 8AIE_M7L_ligand.mol2
Copied: 6YYO_Q1K_ligand.sdf
Copied: 6YYO_Q1K_ligand.mol2
Copied: 7NB4_U6Q_ligand.mol2
Copied: 7NB4_U6Q_ligand.sdf
Copied: 7NGW_UAW_ligand.sdf
Copied: 7NGW_UAW_ligand.mol2
Copied: 8H0M_2EH_ligand.mol2
Copied: 8H0M_2EH_ligand.sdf
Copied: 8EX2_Q2Q_ligand.sdf
Copied: 8EX2_Q2Q_ligand.mol2
Copied: 7UXS_OJC_ligand.mol2
Co