<a href="https://colab.research.google.com/github/GajulapalliNagaVyshnavi/DockFilterHub/blob/main/Docking_filtering_criteria_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
mkdir input_ligand_files


In [None]:
mkdir input_protein_files

**Ligand Diversity**

In [None]:
import os
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator, DataStructs

def compute_tanimoto(fp1, fp2):
    """
    Return the Tanimoto similarity of two fingerprints.

    Thin wrapper around ``DataStructs.TanimotoSimilarity``.

    Args:
        fp1, fp2: RDKit fingerprint objects to compare.
    Returns:
        float: Tanimoto similarity score.
    """
    similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
    return similarity

def filter_ligands(folder_path, threshold=0.8):
    """
    Flag ligands that are structurally distinct from every other ligand in a folder.

    Each readable ligand is converted to a Morgan fingerprint (radius 2) and compared
    with all other readable ligands using Tanimoto similarity. A ligand passes when its
    highest similarity to any other ligand is below ``threshold``.

    Args:
        folder_path (str): Path to the folder containing ligand files (.mol or .sdf).
            For .sdf files only the first readable molecule is used.
        threshold (float): Similarity threshold (default 0.8).

    Returns:
        dict: Mapping of ligand file paths to True (pass) or False (fail). Files that
        cannot be parsed map to False. When only one ligand is readable it maps to
        True, because there is nothing to compare it against.
    """
    # Only .mol/.sdf files are collected, so the reader below needs just two branches.
    ligand_files = [
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.endswith((".mol", ".sdf"))
    ]

    results = {}
    # Keep each fingerprint paired with its file so that skipped (unreadable) files
    # cannot shift the association between paths and fingerprints.
    labelled_fps = []
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=2)

    for ligand in ligand_files:
        if ligand.endswith(".mol"):
            mol = Chem.MolFromMolFile(ligand, removeHs=True)
        else:
            suppl = Chem.SDMolSupplier(ligand)
            mol = next((m for m in suppl if m is not None), None)

        if mol is None:
            results[ligand] = False  # Unreadable molecule
            continue

        labelled_fps.append((ligand, generator.GetFingerprint(mol)))

    for i, (ligand, fp) in enumerate(labelled_fps):
        # Exclude only the ligand itself (by position). Comparing fingerprints by
        # value would also drop exact duplicates, which are the most similar ligands.
        max_similarity = max(
            (
                compute_tanimoto(fp, ref_fp)
                for j, (_, ref_fp) in enumerate(labelled_fps)
                if j != i
            ),
            default=0,
        )
        results[ligand] = max_similarity < threshold

    return results

# Example usage: run the diversity filter over every ligand file in one folder.
# NOTE(review): the setup cells create "input_ligand_files", but this cell reads
# "/content/input_files" — confirm which folder is intended.
folder_path = "/content/input_files"  # Path to folder containing ligand files
filter_results = filter_ligands(folder_path)
print(filter_results)  # Print the {file path: True/False} mapping


{'/content/input_files/1a0q_ligand.sdf': True}




**Ligand Properties**

In [None]:
import os
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

def check_ligand_properties(mol):
    """
    Decide whether a molecule satisfies the ligand property criteria.

    The molecule passes when all of the following hold:
    - every atom symbol is one of C, N, O, P, S, H;
    - ``Descriptors.MolWt`` is between 100 and 900 (inclusive);
    - ``Lipinski.NumRotatableBonds`` is at most 50.

    Args:
    - mol: RDKit molecule object.

    Returns:
    - bool: True if ligand properties meet the criteria, False otherwise.
    """
    weight = Descriptors.MolWt(mol)
    n_rotatable = Lipinski.NumRotatableBonds(mol)

    # Example element whitelist; modify as necessary.
    permitted_elements = {'C', 'N', 'O', 'P', 'S', 'H'}
    if any(atom.GetSymbol() not in permitted_elements for atom in mol.GetAtoms()):
        return False

    return (100 <= weight <= 900) and (n_rotatable <= 50)

def filter_ligands(folder_path):
    """
    Apply the ligand property check to every ligand file in a folder.

    Args:
        folder_path (str): Path to the folder containing ligand files (.mol or .sdf).
            For .sdf files only the first readable molecule is checked.

    Returns:
        dict: Mapping of ligand file paths to True (pass) or False (fail). Files whose
        molecule cannot be read map to False.
    """
    candidate_paths = []
    for entry in os.listdir(folder_path):
        if entry.endswith(".mol") or entry.endswith(".sdf"):
            candidate_paths.append(os.path.join(folder_path, entry))

    outcome = {}
    for path in candidate_paths:
        if path.endswith(".mol"):
            molecule = Chem.MolFromMolFile(path, removeHs=True)
        elif path.endswith(".sdf"):
            supplier = Chem.SDMolSupplier(path)
            molecule = next((m for m in supplier if m), None)
        else:
            # Defensive branch: anything that is neither .mol nor .sdf fails.
            outcome[path] = False
            continue

        # An unreadable molecule fails; otherwise defer to the property check.
        outcome[path] = check_ligand_properties(molecule) if molecule else False

    return outcome

# Example usage: run the property filter over every ligand file in one folder.
# NOTE(review): the setup cells create "input_ligand_files", but this cell reads
# "/content/input_files" — confirm which folder is intended.
folder_path = "/content/input_files"  # Path to folder containing .mol and .sdf files
filter_results = filter_ligands(folder_path)
print(filter_results)  # Print the {file path: True/False} mapping


{'/content/input_files/1a0q_ligand.sdf': True}




**Ligand Strain Energy**

In [None]:
import os
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator, DataStructs
from rdkit.Chem import AllChem

def compute_tanimoto(fp1, fp2):
    """
    Return the Tanimoto similarity of two fingerprints.

    Thin wrapper around ``DataStructs.TanimotoSimilarity``.

    Args:
        fp1, fp2: RDKit fingerprint objects to compare.
    Returns:
        float: Tanimoto similarity score.
    """
    similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
    return similarity

def check_ligand_strain_energy_and_similarity(folder_path, strain_energy_threshold=10.0, similarity_threshold=0.8):
    """
    Filter ligands based on strain energy and Tanimoto similarity.

    A ligand passes when (a) it can be read, (b) its ``_StrainEnergy`` property, if
    present, does not exceed ``strain_energy_threshold``, and (c) its highest Tanimoto
    similarity to any other surviving ligand is below ``similarity_threshold``.
    The strain energy is only read from the molecule; nothing here computes it, so
    ligands without the property skip check (b).

    Args:
        folder_path (str): Path to the folder containing ligand files (.mol or .sdf).
            For .sdf files only the first readable molecule is used.
        strain_energy_threshold (float): Strain energy threshold (default 10.0 kcal/mol).
        similarity_threshold (float): Tanimoto similarity threshold (default 0.8).

    Returns:
        dict: Mapping of ligand file paths to True (pass) or False (fail). Every
        .mol/.sdf file in the folder gets an entry.

    Raises:
        ValueError: If a ``_StrainEnergy`` property is present but not numeric.
    """
    # Only .mol/.sdf files are collected, so the reader below needs just two branches.
    ligand_files = [
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.endswith((".mol", ".sdf"))
    ]
    results = {}
    # (path, fingerprint) pairs for ligands that survived parsing and the strain check.
    # Keeping the path with the fingerprint prevents skipped ligands from shifting the
    # association between files and fingerprints.
    candidates = []
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=2)

    for ligand in ligand_files:
        if ligand.endswith(".mol"):
            mol = Chem.MolFromMolFile(ligand, removeHs=True)
        else:
            suppl = Chem.SDMolSupplier(ligand)
            mol = next((m for m in suppl if m is not None), None)

        if mol is None:
            results[ligand] = False  # Unreadable molecule
            continue

        strain_energy = mol.GetProp('_StrainEnergy') if mol.HasProp('_StrainEnergy') else None
        if strain_energy and float(strain_energy) > strain_energy_threshold:
            results[ligand] = False  # Strain energy above threshold
            continue

        candidates.append((ligand, generator.GetFingerprint(mol)))

    for i, (ligand, fp) in enumerate(candidates):
        # Exclude only the ligand itself (by position) so exact duplicates still count.
        max_similarity = max(
            (
                compute_tanimoto(fp, ref_fp)
                for j, (_, ref_fp) in enumerate(candidates)
                if j != i
            ),
            default=0,
        )
        # Record both outcomes; previously passing ligands were never written.
        results[ligand] = max_similarity < similarity_threshold

    return results

# Example usage: run the strain-energy + similarity filter over one folder.
# NOTE(review): the setup cells create "input_ligand_files", but this cell reads
# "/content/input_files" — confirm which folder is intended.
folder_path = "/content/input_files"  # Path to folder containing ligand files
filter_results = check_ligand_strain_energy_and_similarity(folder_path)
print(filter_results)  # Print the filtering results


{}




In [None]:
#########NEW
import os
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator, DataStructs
from rdkit.Chem import AllChem

def compute_tanimoto(fp1, fp2):
    """
    Return the Tanimoto similarity of two fingerprints.

    Thin wrapper around ``DataStructs.TanimotoSimilarity``.

    Args:
        fp1, fp2: RDKit fingerprint objects to compare.
    Returns:
        float: Tanimoto similarity score.
    """
    similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
    return similarity

def Ligand_Strain_Energy(folder_path, strain_energy_threshold=10.0, similarity_threshold=0.8):
    """
    Report whether any ligand in a folder passes the strain-energy and similarity checks.

    A ligand is kept when it can be read and its ``_StrainEnergy`` property, if present,
    does not exceed ``strain_energy_threshold``. The strain energy is only read from the
    molecule; nothing here computes it. Among the kept ligands, one passes when its
    highest Tanimoto similarity to any other kept ligand is below ``similarity_threshold``.

    Args:
        folder_path (str): Path to the folder containing ligand files (.mol or .sdf).
            For .sdf files only the first readable molecule is used.
        strain_energy_threshold (float): Strain energy threshold (default 10.0 kcal/mol).
        similarity_threshold (float): Tanimoto similarity threshold (default 0.8).

    Returns:
        bool: True if at least one ligand passes all criteria, False otherwise.

    Raises:
        ValueError: If a ``_StrainEnergy`` property is present but not numeric.
    """
    # Only .mol/.sdf files are collected, so the reader below needs just two branches.
    ligand_files = [
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.endswith((".mol", ".sdf"))
    ]
    fingerprints = []
    # Use the generator that this function already builds (Morgan, radius 2) instead of
    # the legacy AllChem.GetMorganFingerprintAsBitVect call.
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=2)

    for ligand in ligand_files:
        if ligand.endswith(".mol"):
            mol = Chem.MolFromMolFile(ligand, removeHs=True)
        else:
            suppl = Chem.SDMolSupplier(ligand)
            mol = next((m for m in suppl if m is not None), None)

        if mol is None:
            continue  # Unreadable molecule

        strain_energy = mol.GetProp('_StrainEnergy') if mol.HasProp('_StrainEnergy') else None
        if strain_energy and float(strain_energy) > strain_energy_threshold:
            continue  # Strain energy above threshold

        fingerprints.append(generator.GetFingerprint(mol))

    def _max_similarity_to_others(i):
        """Highest similarity between fingerprint ``i`` and every other fingerprint."""
        return max(
            (
                compute_tanimoto(fingerprints[i], ref_fp)
                for j, ref_fp in enumerate(fingerprints)
                if i != j
            ),
            default=0,
        )

    return any(
        _max_similarity_to_others(i) < similarity_threshold
        for i in range(len(fingerprints))
    )

# # Example usage:
# folder_path = "/content/input_files"  # Path to folder containing ligand files
# result = Ligand_Strain_Energy(folder_path)
# print(result)


**Protein Quality**

In [None]:
import os
from Bio import PDB

def filter_proteins(folder_path, resolution_threshold=2.5):
    """
    Check every PDB file in a folder against a resolution cutoff.

    The resolution is read from the ``'resolution'`` entry of the parsed structure
    header. A file passes only when that value is present and strictly below
    ``resolution_threshold``.

    Args:
        folder_path (str): Path to the folder containing PDB files.
        resolution_threshold (float): Resolution threshold (default 2.5 Å).

    Returns:
        dict: Mapping of PDB file paths to True (pass) or False (fail). Files that
        raise any error while being processed are reported on stdout and map to False.
    """
    verdicts = {}

    for name in os.listdir(folder_path):
        if not name.endswith(".pdb"):
            continue
        path = os.path.join(folder_path, name)

        try:
            # The file path doubles as the structure id.
            structure = PDB.PDBParser(QUIET=True).get_structure(path, path)
            reported = structure.header.get('resolution')
            # A missing (or zero) resolution counts as a failure.
            verdicts[path] = bool(reported and reported < resolution_threshold)
        except Exception as e:
            # Best effort: an unreadable file fails instead of aborting the batch.
            verdicts[path] = False
            print(f"Error processing {path}: {e}")

    return verdicts

# Example usage: apply the resolution filter to every .pdb file in one folder.
folder_path = "/content/input_protein_files"  # Replace with your folder path
filter_results = filter_proteins(folder_path)
print(filter_results)  # Print the {file path: True/False} mapping


{'/content/input_protein_files/1a0q_protein.pdb': False}


**Structural Quality**

In [None]:
import os
from Bio import PDB

def _read_remark3_metric(pdb_file, label_pattern):
    """Return the first numeric REMARK 3 value whose label matches ``label_pattern``, or None."""
    import re  # Local import keeps this cell self-contained.

    pattern = re.compile(r"^REMARK\s+3\s+" + label_pattern + r"\s*:\s*(\d*\.?\d+)")
    try:
        with open(pdb_file, "r", errors="replace") as handle:
            for line in handle:
                match = pattern.match(line)
                if match:
                    return float(match.group(1))
    except OSError:
        return None
    return None


def check_structure_quality(pdb_file):
    """
    Check the quality of a PDB file based on resolution, R-factor, and correlation.

    The file passes only when all of the following are available and satisfied:
    resolution <= 2.0, R-factor <= 0.2, correlation >= 0.95, and
    ``check_ligand_completeness`` returns True for the parsed structure.

    Resolution is read from the parsed header under the lowercase ``'resolution'``
    key (the capitalised key is still consulted as a fallback). The header is also
    checked for ``'R_factor'`` and ``'Correlation'``; when they are absent, the values
    are looked up in the file's REMARK 3 records ("R VALUE (WORKING SET)" and
    "CORRELATION COEFFICIENT FO-FC").
    NOTE(review): the REMARK 3 wording varies between refinement programs — confirm
    these labels cover the files in use. A metric that cannot be found fails the check.

    Args:
        pdb_file (str): Path to the PDB file.
    Returns:
        bool: True if the file meets the structural quality criteria, False otherwise
        (including when the file cannot be parsed; the error is printed).
    """
    parser = PDB.PDBParser(QUIET=True)

    try:
        structure = parser.get_structure('Protein', pdb_file)
    except Exception as e:
        print(f"Error parsing {pdb_file}: {e}")
        return False

    header = structure.header

    resolution = header.get('resolution')
    if resolution is None:
        resolution = header.get('Resolution')

    r_factor = header.get('R_factor')
    if r_factor is None:
        r_factor = _read_remark3_metric(pdb_file, r"R VALUE\s+\(WORKING SET\)")

    correlation = header.get('Correlation')
    if correlation is None:
        correlation = _read_remark3_metric(pdb_file, r"CORRELATION COEFFICIENT FO-FC")

    # Assuming ligands are non-protein residues (HETATM)
    ligands_complete = check_ligand_completeness(structure)

    # Missing (None) or zero metrics count as failures, as before.
    return bool(
        resolution and resolution <= 2.0
        and r_factor and r_factor <= 0.2
        and correlation and correlation >= 0.95
        and ligands_complete
    )

def check_ligand_completeness(structure):
    """
    Report whether the structure contains at least one non-standard residue.

    This is a simplified stand-in for a real completeness check: a residue counts as a
    ligand whenever the first element of its ``id`` is not a single blank, and the
    presence of any such residue is treated as "complete".
    NOTE(review): this presumably also counts waters and other hetero groups — confirm
    that is acceptable.

    Args:
        structure (Bio.PDB.Structure.Structure): The PDB structure.
    Returns:
        bool: True if at least one such residue exists, False otherwise.
    """
    hetero_residues = [
        residue
        for model in structure
        for chain in model
        for residue in chain
        if residue.id[0] != " "
    ]
    return bool(hetero_residues)

def filter_pdb_files(folder_path):
    """
    Run the structural quality check on every PDB file in a folder.

    Args:
        folder_path (str): Path to the folder containing PDB files.

    Returns:
        dict: A dictionary mapping PDB file paths to True (pass) or False (fail),
        as returned by ``check_structure_quality``.
    """
    pdb_paths = (
        os.path.join(folder_path, name)
        for name in os.listdir(folder_path)
        if name.endswith(".pdb")
    )
    return {path: check_structure_quality(path) for path in pdb_paths}

# Example usage: apply the structural quality check to every .pdb file in one folder.
folder_path = "/content/input_protein_files"  # Replace with actual folder path
filter_results = filter_pdb_files(folder_path)
print(filter_results)  # Print the filtering results (True or False for each PDB file)


{'/content/input_protein_files/1a0q_protein.pdb': False}


**Steric Clashes**

In [None]:
import os
from typing import Dict


def Steric_Clashes(pdb_file_path: str) -> bool:
    """
    Placeholder steric-clash check for a single PDB file.

    No clash detection is implemented yet: the argument is ignored and every file is
    reported as clash-free. Replace the body with a real check.

    Args:
        pdb_file_path (str): Path to the PDB file.

    Returns:
        bool: True if there are no significant steric clashes, False if there are.
        Currently always True.
    """
    # Placeholder result: all files are assumed valid for now.
    no_significant_clashes = True
    return no_significant_clashes


def filter_pdbs_by_steric_clashes(folder_path: str) -> Dict[str, bool]:
    """
    Run the steric-clash check on every PDB file in a folder.

    Args:
        folder_path (str): Path to the folder containing PDB files.

    Returns:
        Dict[str, bool]: Mapping of PDB file paths to True (no steric clashes) or
        False (with steric clashes), as returned by ``Steric_Clashes`` for each file.
    """
    verdicts: Dict[str, bool] = {}

    for entry in os.listdir(folder_path):
        if entry.endswith('.pdb'):
            full_path = os.path.join(folder_path, entry)
            verdicts[full_path] = Steric_Clashes(full_path)

    return verdicts


# Example usage: run the (placeholder) steric-clash check on every .pdb file in one folder.
folder_path = "/content/input_protein_files"  # Path to folder containing PDB files
filter_results = filter_pdbs_by_steric_clashes(folder_path)
print(filter_results)  # Print the {file path: True/False} mapping


{'/content/input_protein_files/1a0q_protein.pdb': True}


In [None]:
#########NEW
import os
from typing import List

def check_Steric_Clashes(pdb_file_path: str) -> bool:
    """
    Placeholder steric-clash check for a single PDB file.

    No clash detection is implemented yet: the argument is ignored and every file is
    reported as clash-free. Replace the body with a real check.

    Args:
        pdb_file_path (str): Path to the PDB file.

    Returns:
        bool: True if there are no significant steric clashes, False if there are.
        Currently always True.
    """
    # Replace with actual steric clash check
    no_significant_clashes = True
    return no_significant_clashes

def Steric_Clashes(folder_path: str) -> bool:
    """
    Report whether any PDB file in a folder passes the steric-clash check.

    Files are checked with ``check_Steric_Clashes``; the scan stops at the first file
    that passes.

    Args:
        folder_path (str): Path to the folder containing PDB files.

    Returns:
        bool: True if at least one PDB file passes, False otherwise (including when
        the folder contains no .pdb files).
    """
    pdb_paths = (
        os.path.join(folder_path, entry)
        for entry in os.listdir(folder_path)
        if entry.endswith('.pdb')
    )
    return any(check_Steric_Clashes(path) for path in pdb_paths)

# # Example usage:
# folder_path = "/content/input_protein_files"
# result = Steric_Clashes(folder_path)
# print(result)


**Covalent Ligands/Artifacts**

In [None]:
#########NEW
import os
from rdkit import Chem
from rdkit.Chem import AllChem

def is_non_covalent_ligand(mol):
    """
    Check if the ligand is treated as non-covalent by this filter.

    The molecule is rejected as soon as any of its bonds reports a bond type above 1.0
    (``GetBondTypeAsDouble``); a molecule with no such bond, or with no bonds at all,
    is accepted.
    NOTE(review): this looks only at bond orders inside the ligand, not at bonds to a
    protein — confirm this heuristic is what "covalent ligand" is meant to capture.

    Args:
        mol: RDKit molecule object.
    Returns:
        bool: True if no bond has a bond type above 1.0, False otherwise.
    """
    has_higher_order_bond = any(
        bond.GetBondTypeAsDouble() > 1.0 for bond in mol.GetBonds()
    )
    return not has_higher_order_bond

def Covalent_Ligands(folder_path):
    """
    Report whether any ligand in a folder passes the non-covalent ligand check.

    Each .mol/.sdf file is read (first readable molecule only for .sdf) and passed to
    ``is_non_covalent_ligand``; the scan stops at the first ligand that passes.

    Args:
        folder_path (str): Path to the folder containing ligand files (.mol or .sdf).
    Returns:
        bool: True if at least one ligand passes, False otherwise.
    """
    candidate_paths = [
        os.path.join(folder_path, entry)
        for entry in os.listdir(folder_path)
        if entry.endswith((".mol", ".sdf"))
    ]

    for path in candidate_paths:
        if path.endswith(".mol"):
            molecule = Chem.MolFromMolFile(path, removeHs=True)
        elif path.endswith(".sdf"):
            supplier = Chem.SDMolSupplier(path)
            molecule = next((m for m in supplier if m), None)
        else:
            continue

        # Unreadable molecules are skipped; the first passing ligand settles the answer.
        if molecule and is_non_covalent_ligand(molecule):
            return True

    return False  # No valid ligand found

# # Example usage:
# folder_path = "/content/input_ligand_files"
# result = Covalent_Ligands(folder_path)
# print(result)


**Ligand Conformation & Validity**

In [None]:
import os
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
from rdkit.Chem import rdFingerprintGenerator

def compute_tanimoto(fp1, fp2):
    """
    Return the Tanimoto similarity of two fingerprints.

    Thin wrapper around ``DataStructs.TanimotoSimilarity``.

    Args:
        fp1, fp2: RDKit fingerprint objects to compare.
    Returns:
        float: Tanimoto similarity score.
    """
    similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
    return similarity

def filter_ligands(folder_path, threshold=0.8):
    """
    Filter ligands based on Tanimoto similarity from a folder.

    Each readable ligand is converted to a Morgan fingerprint (radius 2) and compared
    with all other readable ligands. A ligand passes when its highest similarity to
    any other ligand is below ``threshold``.

    Args:
        folder_path (str): Path to the folder containing ligand files (.mol or .sdf).
            For .sdf files only the first readable molecule is used.
        threshold (float): Similarity threshold (default 0.8).

    Returns:
        dict: Mapping of ligand file paths to True (pass) or False (fail). Files that
        cannot be parsed map to False. When only one ligand is readable it maps to
        True, because there is nothing to compare it against.
    """
    # Only .mol/.sdf files are collected, so the reader below needs just two branches.
    ligand_files = [
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.endswith((".mol", ".sdf"))
    ]

    results = {}
    fingerprints = []  # (ligand path, fingerprint) pairs
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=2)

    for ligand in ligand_files:
        if ligand.endswith(".mol"):
            mol = Chem.MolFromMolFile(ligand, removeHs=True)
        else:
            suppl = Chem.SDMolSupplier(ligand)
            mol = next((m for m in suppl if m is not None), None)

        if mol is None:
            results[ligand] = False  # Unreadable molecule
            continue

        fingerprints.append((ligand, generator.GetFingerprint(mol)))

    for i, (ligand, fp) in enumerate(fingerprints):
        # Exclude only the ligand itself (by position). Comparing fingerprints by
        # value would also drop exact duplicates, which are the most similar ligands.
        max_similarity = max(
            (
                compute_tanimoto(fp, ref_fp)
                for j, (_, ref_fp) in enumerate(fingerprints)
                if j != i
            ),
            default=0,
        )
        results[ligand] = max_similarity < threshold

    return results

# Example usage: run the similarity filter over every ligand file in one folder.
folder_path = "/content/input_ligand_files"  # Path to folder containing ligand files
filter_results = filter_ligands(folder_path)
print(filter_results)  # Print the {file path: True/False} mapping


{'/content/input_ligand_files/1a0q_ligand.sdf': True}




In [None]:
########NEW
import os
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

def compute_tanimoto(fp1, fp2):
    """
    Return the Tanimoto similarity of two fingerprints.

    Thin wrapper around ``DataStructs.TanimotoSimilarity``.

    Args:
        fp1, fp2: RDKit fingerprint objects to compare.
    Returns:
        float: Tanimoto similarity score.
    """
    similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
    return similarity

def Ligand_Conformation(folder_path, threshold=0.8):
    """
    Report whether any ligand in a folder is sufficiently dissimilar to all others.

    Each readable ligand is converted to a Morgan fingerprint (radius 2). A ligand
    passes when its highest Tanimoto similarity to any other readable ligand is below
    ``threshold``.

    Args:
        folder_path (str): Path to the folder containing ligand files (.mol or .sdf).
            For .sdf files only the first readable molecule is used.
        threshold (float): Similarity threshold (default 0.8).

    Returns:
        bool: True if at least one ligand passes, False otherwise (including when no
        ligand could be read).
    """
    # Only .mol/.sdf files are collected, so the reader below needs just two branches.
    ligand_files = [
        os.path.join(folder_path, f)
        for f in os.listdir(folder_path)
        if f.endswith((".mol", ".sdf"))
    ]
    fingerprints = []

    for ligand in ligand_files:
        if ligand.endswith(".mol"):
            mol = Chem.MolFromMolFile(ligand, removeHs=True)
        else:
            suppl = Chem.SDMolSupplier(ligand)
            mol = next((m for m in suppl if m is not None), None)

        if mol is None:
            continue  # Unreadable molecule

        fingerprints.append(AllChem.GetMorganFingerprintAsBitVect(mol, 2))

    for i, fp in enumerate(fingerprints):
        # Exclude only the ligand itself (by position). Comparing fingerprints by
        # value would also drop exact duplicates, which are the most similar ligands.
        max_similarity = max(
            (
                compute_tanimoto(fp, ref_fp)
                for j, ref_fp in enumerate(fingerprints)
                if j != i
            ),
            default=0,
        )
        if max_similarity < threshold:
            return True  # At least one ligand passes

    return False  # No ligand passes

# # Example usage:
# folder_path = "/content/input_ligand_files"
# result = Ligand_Conformation(folder_path)
# print(result)


In [None]:
import os
import glob
from typing import List, Tuple

def complex_energy_calculation(protein_file: str, ligand_file: str) -> float:
    """
    Placeholder for the protein-ligand interaction energy calculation.

    Neither file is read: the function returns a random value drawn uniformly from
    [-15, 5] as a simulated binding energy. Replace it with an actual implementation
    (e.g., using AutoDock, RDKit, or another tool) before relying on the results.
    """
    import random  # Placeholder for actual calculation

    lowest, highest = -15, 5  # Range of the simulated binding energy values
    return random.uniform(lowest, highest)

def filter_protein_ligand_interactions(protein_folder: str, ligand_folder: str, threshold: float = -7.0) -> List[Tuple[str, str, bool]]:
    """
    Score every protein-ligand pair and flag those at or below an energy cutoff.

    Parameters:
    - protein_folder: Path to the folder containing protein PDB files (*.pdb).
    - ligand_folder: Path to the folder containing ligand files (*.mol and *.sdf).
    - threshold: Energy cutoff value for filtering (default: -7.0 kcal/mol).

    Returns:
    - A list of (protein file, ligand file, passed) tuples, one per pair, where
      ``passed`` is True when ``complex_energy_calculation`` returns a value less than
      or equal to ``threshold``. The list is empty when either folder has no
      matching files.
    """
    proteins = glob.glob(os.path.join(protein_folder, "*.pdb"))

    # .mol files first, then .sdf files, matching the order in which pairs are scored.
    ligands = []
    for pattern in ("*.mol", "*.sdf"):
        ligands.extend(glob.glob(os.path.join(ligand_folder, pattern)))

    return [
        (protein, ligand, complex_energy_calculation(protein, ligand) <= threshold)
        for protein in proteins
        for ligand in ligands
    ]

# Example usage:
# protein_folder = "path/to/protein_folder"
# ligand_folder = "path/to/ligand_folder"
# filtered_results = filter_protein_ligand_interactions(protein_folder, ligand_folder)
# print(filtered_results)