In [13]:
import os
import glob
import pandas as pd
import numpy as np
from scipy.spatial import distance_matrix
import re
from collections import defaultdict
from biopandas.pdb import PandasPdb

def grid_list(atom_df):
    """Return the atom positions in ``atom_df`` as a list of ``(x, y, z)`` tuples.

    The ``x_coord``, ``y_coord`` and ``z_coord`` columns are read and paired
    up row by row, preserving row order.
    """
    xs = atom_df['x_coord']
    ys = atom_df['y_coord']
    zs = atom_df['z_coord']
    return [point for point in zip(xs, ys, zs)]

def load_atom_df(pdb_path):
    """Read a PDB file and return its HETATM records.

    Parameters
    ----------
    pdb_path : str
        Path of the PDB file to parse.

    Returns
    -------
    The ``'HETATM'`` entry of the parsed structure's ``df`` mapping.
    Callers in this notebook pass it to ``grid_list``, which reads its
    ``x_coord`` / ``y_coord`` / ``z_coord`` columns.
    """
    # read_pdb() returns the parsed structure; its ``df`` attribute maps a
    # record-type name to the table of records of that type.
    structure = PandasPdb().read_pdb(pdb_path)
    return structure.df['HETATM']

def compute_overlap(coords1, coords2, threshold=2.0):
    """Score the mutual overlap of two point sets as a fraction.

    For each set, the fraction of its points that lie closer than
    ``threshold`` to at least one point of the other set is computed.
    The smaller of the two fractions is returned, so a high score needs
    both sets to be largely covered by the other.

    Returns 0 when either set is empty.
    """
    n_first, n_second = len(coords1), len(coords2)
    if not n_first or not n_second:
        return 0

    # Rows follow coords1, columns follow coords2.
    pairwise = distance_matrix(coords1, coords2)
    nearest_to_first = pairwise.min(axis=1)
    nearest_to_second = pairwise.min(axis=0)

    covered_first = (nearest_to_first < threshold).sum() / n_first
    covered_second = (nearest_to_second < threshold).sum() / n_second
    return min(covered_first, covered_second)

def process_modes_by_coords(mode_files, overlap_cutoff=0.7):
    """Delete mode files whose coordinates duplicate an earlier kept file.

    Files are visited in the order given. Each file is compared against
    every file kept so far; at the first kept file whose overlap score
    (see ``compute_overlap``) is at least ``overlap_cutoff``, the current
    file is reported and **removed from disk**. Files that match no kept
    file are kept.

    Parameters
    ----------
    mode_files : iterable of str
        Paths of the PDB files to de-duplicate, in priority order
        (earlier files win).
    overlap_cutoff : float, optional
        Minimum overlap score at which a file counts as a duplicate.

    Returns
    -------
    list of str
        The paths that were kept, in input order.
    """
    # Keep each retained file's coordinates alongside its path so a kept
    # file is parsed once instead of being re-read for every comparison.
    kept = []
    for file in mode_files:
        coords = grid_list(load_atom_df(file))
        for kept_file, kept_coords in kept:
            overlap = compute_overlap(coords, kept_coords)
            if overlap >= overlap_cutoff:
                print(f"Deleting {file} — {overlap*100:.1f}% overlap with {kept_file}")
                os.remove(file)
                break
        else:
            # No kept file was similar enough: this one is retained.
            kept.append((file, coords))
    return [kept_file for kept_file, _ in kept]


In [14]:
def get_protein_name(filename):
    """Return the upper-cased 4-character ID that starts a file's base name.

    The directory part of ``filename`` is ignored. ``None`` is returned
    when the base name does not begin with four ASCII letters/digits.
    """
    stem = os.path.basename(filename)  # drop any directory components
    found = re.match(r'([a-zA-Z0-9]{4})', stem)  # anchored at the start of the name
    if found is None:
        return None
    return found.group(1).upper()

def natural_sort_key(s):
    """Build a sort key that orders embedded numbers by value.

    The string is split into alternating non-digit and digit runs. Digit
    runs become ``int`` and every other run is lower-cased, so that e.g.
    ``"mode_2"`` sorts before ``"mode_10"``.

    Parameters
    ----------
    s : str
        The string to build a key for.

    Returns
    -------
    list
        Alternating lower-cased text pieces and integers, always starting
        with a (possibly empty) text piece.
    """
    # isdecimal() rather than isdigit(): it agrees with what r'\d' splits
    # out and with what int() can parse. isdigit() is also true for
    # characters such as '²', which int() rejects with ValueError.
    return [
        int(chunk) if chunk.isdecimal() else chunk.lower()
        for chunk in re.split(r'(\d+)', s)
    ]

# Collect every PDB file in the folder, in natural (number-aware) order.
unlabeled_files = sorted(glob.glob("CLR-Unlabeled_/*.pdb"), key=natural_sort_key)

# Bucket the files by the protein ID parsed from each file name; files
# without a recognisable ID are left out.
grouped = defaultdict(list)
for file in unlabeled_files:
    protein = get_protein_name(file)
    if not protein:
        continue
    grouped[protein].append(file)

# One list of files per protein.
list_of_lists = [*grouped.values()]

# De-duplicate within each protein's group. NOTE: this removes files from disk.
for i, group in enumerate(list_of_lists):
    process_modes_by_coords(group)

Deleting CLR-Unlabeled_/1LRI_protein_vina_out1LRI_mode_2.pdb — 85.1% overlap with CLR-Unlabeled_/1LRI_protein_vina_out1LRI_mode_1.pdb
Deleting CLR-Unlabeled_/1LRI_protein_vina_out1LRI_mode_3.pdb — 83.8% overlap with CLR-Unlabeled_/1LRI_protein_vina_out1LRI_mode_1.pdb
Deleting CLR-Unlabeled_/1LRI_protein_vina_out1LRI_mode_5.pdb — 91.9% overlap with CLR-Unlabeled_/1LRI_protein_vina_out1LRI_mode_1.pdb
Deleting CLR-Unlabeled_/1N83_protein_vina_out1N83_mode_3.pdb — 97.3% overlap with CLR-Unlabeled_/1N83_protein_vina_out1N83_mode_1.pdb
Deleting CLR-Unlabeled_/1N83_protein_vina_out1N83_mode_5.pdb — 70.3% overlap with CLR-Unlabeled_/1N83_protein_vina_out1N83_mode_1.pdb
Deleting CLR-Unlabeled_/2ZXE_protein_vina_out2ZXE_mode_2.pdb — 70.3% overlap with CLR-Unlabeled_/2ZXE_protein_vina_out2ZXE_mode_1.pdb
Deleting CLR-Unlabeled_/2ZXE_protein_vina_out2ZXE_mode_3.pdb — 86.5% overlap with CLR-Unlabeled_/2ZXE_protein_vina_out2ZXE_mode_1.pdb
Deleting CLR-Unlabeled_/2ZXE_protein_vina_out2ZXE_mode_4.pdb —