In [2]:
import os
import shutil
import logging
import tabulate
import cloudpickle as pickle
import pandas as pd
import numpy as np
from functools import partial
import time
from pathlib import Path
import datamol as dm
from IPython.display import clear_output
from tempfile import NamedTemporaryFile, TemporaryDirectory
from itertools import combinations
from joblib import Parallel, delayed
from rdkit import Chem
from rdkit.Chem import rdFMCS
import concurrent.futures
from spyrmsd import rmsd, molecule
from espsim import GetEspSim
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN
from pathlib import Path

import oddt
import oddt.fingerprints
import oddt.shape
import oddt.toolkits.rdk

from HandsFreeDocking.Wrapper_Docking import PipelineDocking
from HandsFreeDocking.analysis.clustering import (
    OptimizedDBSCANClustering, PairwiseMatrixComputer, OptimizedKMedoidsClustering,
    calc_rmsd_mcs_with_timeout, calc_usr_similarity, calc_splif
)

from HandsFreeDocking.tools.Protein_Minimization import ProteinMinimizer
from HandsFreeDocking.analysis.clustering import OptimizedHierarchicalClustering

In [4]:
# --- Input structures ---------------------------------------------------------
protein_pdb = Path("./examples/LAG3_Moloc_2.pdb")
ligands_sdf = Path("./examples/Ligands_To_Dock.sdf")
cystal_sdf = Path("./examples/Fake_Crystal.sdf")

# --- Working directories and result caches (one pair per docking set-up) ------
docking_pkl = Path("./examples/TMP_Docking/docking_results.pkl")
docking_dir = Path("./examples/TMP_Docking")
docking_dir_RXDOCK = Path("./examples/TMP_Docking_RXDOCK")
docking_pkl_RXDOCK = docking_dir_RXDOCK / "docking_results.pkl"

docking_dir_PLANTS = Path("./examples/TMP_Docking_PLANTS")
docking_pkl_PLANTS = docking_dir_PLANTS / "docking_results.pkl"

# Check if all files exist, and say which ones are missing instead of silently
# discarding the result of the check.
files_to_check = [protein_pdb, ligands_sdf, cystal_sdf]
missing_files = [file for file in files_to_check if not file.exists()]
all_files_exist = not missing_files
if not all_files_exist:
    # A named logger is used on purpose: the module-level logging.warning() would
    # implicitly install a root handler and could duplicate other loggers' output.
    logging.getLogger(__name__).warning(
        "Missing input file(s): %s",
        ", ".join(str(file) for file in missing_files),
    )

# True  -> run the docking again and refresh the pickle cache.
# False -> load the previously pickled results instead.
RERUN = True

In [None]:
from HandsFreeDocking.RxDock_Pipeline import RxDock_Docking
from HandsFreeDocking.Plants_Pipeline import Plants_Docking

In [ ]:
# Stand-alone RxDock run on the example inputs, using its own working directory.
rxdock_pipeline = RxDock_Docking(
    docking_dir_RXDOCK,
    protein_pdb,
    cystal_sdf,
    ligands_sdf,
    protonation_method="oe",
)
rxdock_pipeline.main(n_poses=10, n_cpus=2)

In [None]:
# Stand-alone PLANTS run on the example inputs, using its own working directory.
plants_pipeline = Plants_Docking(
    docking_dir_PLANTS,
    protein_pdb,
    cystal_sdf,
    ligands_sdf,
    toolkit="openeye",
)
plants_pipeline.main(n_confs=10, n_cpus=2)

---

In [5]:
if RERUN:
    # Start from an empty working directory so results of a previous run cannot mix in.
    shutil.rmtree(docking_dir, ignore_errors=True)

    # Initialize the docking pipeline
    docking = PipelineDocking(
        workdir=docking_dir,
        docking_software=["rxdock", "plants"],                # One or more engines, e.g. "rxdock", "plants", "gnina", "openeye"
        settings=(10, 4),                                     # (n_conformers, n_cpus)
        protein_pdb=protein_pdb,
        ligands_input=ligands_sdf,                            # Can be SDF or SMILES file
        crystal_sdf=cystal_sdf,
        toolkit="cdpkit",                                     # Choose "cdpkit" or "openeye"
    )

    # Run the docking and get results
    results = docking.run()
    FULL_DF = docking.concat_df()

    # The directory was wiped above; guarantee the cache location exists before
    # writing so the pickle step does not depend on the pipeline re-creating it.
    docking_pkl.parent.mkdir(parents=True, exist_ok=True)
    with open(docking_pkl, "wb") as f:
        pickle.dump(FULL_DF, f)

    # clear_output()
else:
    if not docking_pkl.exists():
        raise FileNotFoundError(
            f"No cached docking results at {docking_pkl}; set RERUN = True to run the docking first."
        )

    # SECURITY: unpickling executes arbitrary code. Only load a cache that was
    # written by this notebook, never a file obtained from an untrusted source.
    with open(docking_pkl, "rb") as f:
        FULL_DF = pickle.load(f)

2025-05-30 17:00:37,955 - HandsFreeDocking.Wrapper_Docking - INFO - Using SDF input directly: examples/Ligands_To_Dock.sdf
2025-05-30 17:00:37,955 - HandsFreeDocking.Wrapper_Docking - INFO - Starting RxDock docking
2025-05-30 17:00:37,956 - HandsFreeDocking.RxDock_Pipeline - INFO - Starting RxDock docking pipeline...
2025-05-30 17:00:37,957 - HandsFreeDocking.RxDock_Pipeline - INFO - Step 1: Sourcing macro (cleaning protein)...
2025-05-30 17:00:38,083 - HandsFreeDocking.RxDock_Pipeline - INFO - Step 2: Preparing protein...


Preparing protein with Chimera ...


2025-05-30 17:00:39,487 - HandsFreeDocking.RxDock_Pipeline - INFO - Step 3: Defining binding site...
2025-05-30 17:00:39,487 - HandsFreeDocking.RxDock_Pipeline - INFO - Creating RxDock parameter file and defining binding site...
2025-05-30 17:00:39,488 - HandsFreeDocking.RxDock_Pipeline - INFO - Running command: rbcavity -W -d -r examples/TMP_Docking/RxDock/rxdock.prm
2025-05-30 17:00:40,015 - HandsFreeDocking.RxDock_Pipeline - INFO - ***********************************************
The RxDock molecular docking program is licensed under GNU LGPL version 3.
RxDock is maintained by Vedran Miletić, Patrik Nikolić, and Luka Vretenar.
Visit https://www.rxdock.org/ for more information.
Executable:	rbcavity/0.1.0
Library:	librxdock.so/0.1.0
RBT_ROOT:	/home/hitesit/Software/rxdock/rxdock_installation
RBT_HOME:	/home/hitesit/Python_Packages/Docking_Pipelines/HandsFreeDocking/examples/TMP_Docking/RxDock
Current dir:	/home/hitesit/Python_Packages/Docking_Pipelines/HandsFreeDocking
Date:		Fri May 

Preparing protein with Chimera ...


2025-05-30 17:00:49,119 - HandsFreeDocking.Plants_Pipeline - INFO - Preparing ligands using cdpkit toolkit
2025-05-30 17:00:49,120 - HandsFreeDocking.tools.Ligand_Preparation - INFO - Processing Apigenin from SDF
2025-05-30 17:00:49,192 - HandsFreeDocking.tools.Ligand_Preparation - INFO - Processing 6-methylflavone from SDF
2025-05-30 17:00:49,212 - HandsFreeDocking.tools.Ligand_Preparation - INFO - Processing Chrysin from SDF
2025-05-30 17:00:49,240 - HandsFreeDocking.tools.Ligand_Preparation - INFO - Processing Luteolin from SDF
2025-05-30 17:00:49,389 - HandsFreeDocking.tools.Ligand_Preparation - INFO - Processing Kaempferol from SDF
2025-05-30 17:00:49,519 - HandsFreeDocking.tools.Ligand_Preparation - INFO - Processing Eriodictyol from SDF
2025-05-30 17:00:49,718 - HandsFreeDocking.tools.Ligand_Preparation - INFO - Processing Fisetin from SDF
2025-05-30 17:00:49,886 - HandsFreeDocking.Plants_Pipeline - INFO - Converting prepared ligands to mol2 format
2025-05-30 17:00:49,905 - Hand

In [None]:
# Preview only the first rows; rendering the whole results table bloats the notebook.
FULL_DF.head()

In [None]:
# Drop every row whose ID contains "L17" (the reason for the exclusion is not
# recorded in this notebook). The explicit .copy() makes the result an independent
# frame, so later column assignments do not act on a filtered slice.
FULL_DF = FULL_DF[~FULL_DF['ID'].str.contains('L17')].copy()

# The ligand name is the part of the ID that precedes the first underscore.
lig_name_series = FULL_DF["ID"].str.split("_").str[0]

# DataFrame.insert raises ValueError when the column already exists, which made
# this cell fail on a second execution. Overwrite in that case instead.
if "Lig_Name" in FULL_DF.columns:
    FULL_DF["Lig_Name"] = lig_name_series
else:
    FULL_DF.insert(1, "Lig_Name", lig_name_series)

In [None]:
# Plain Python list of the "Molecule" column, in row order.
ALL_MOLS = FULL_DF.loc[:, "Molecule"].to_list()

In [None]:
# Pairwise matrix over all poses in ALL_MOLS.
# The per-pair metric is calc_rmsd_mcs_with_timeout with timeout=60 pre-bound;
# the same timeout value is also handed to the matrix computer itself.
rmsd_func = partial(calc_rmsd_mcs_with_timeout, timeout=60)
computer = PairwiseMatrixComputer(
    ALL_MOLS,
    n_jobs=8,
    timeout=60,
)
distance_matrix = computer.compute_matrix(rmsd_func)

In [None]:
# Settings for the hierarchical clusterer, collected in one place.
hierarchical_settings = {
    "linkage_methods": ("ward", "complete", "average"),
    "use_dimensionality_reduction": True,
    "verbose": True,
}
clustering = OptimizedHierarchicalClustering(**hierarchical_settings)

# Fit on the pairwise matrix computed earlier; the returned labels are the
# clusterer's own ("optimal") solution and are compared with fixed cuts later.
labels = clustering.fit(distance_matrix)

In [None]:
# Two alternative cuts of the fitted hierarchy, by distance threshold.
tight_clusters = clustering.get_clusters_by_distance(0.4)  # More stringent similarity
loose_clusters = clustering.get_clusters_by_distance(1.5)  # More relaxed similarity

# Report how many distinct labels each solution contains.
for solution_name, assignment in (
    ("Optimal", labels),
    ("Tight", tight_clusters),
    ("Loose", loose_clusters),
):
    print(f"{solution_name} clustering has {len(np.unique(assignment))} clusters")

# custom_labels = clustering.get_clusters_constrained(
#     distance_threshold=0.4,
#     min_clusters=3,
#     max_clusters=10
# )
# print(f"Custom clustering has {len(np.unique(custom_labels))} clusters")

In [None]:
# Attach the tight-threshold cluster labels as column "Type_3".
# assign() builds a new frame instead of writing into FULL_DF in place; FULL_DF
# was produced by boolean filtering, where in-place column assignment can raise
# pandas' SettingWithCopyWarning.
FULL_DF = FULL_DF.assign(Type_3=tight_clusters)

In [None]:
from pymol import cmd

cmd.reinitialize()
# Hand PyMOL a plain string rather than a pathlib.Path object.
cmd.load(str(protein_pdb))


def tmp_save(mol: Chem.rdchem.Mol) -> str:
    """Write ``mol`` to a new temporary ``.sdf`` file and return the file's path.

    The file is created with ``delete=False``, so it is NOT removed
    automatically; the caller is responsible for deleting it.
    """
    # Reserve a unique file name, then close the handle before writing: a
    # NamedTemporaryFile that is still open cannot be reopened by name on Windows.
    with NamedTemporaryFile(suffix=".sdf", delete=False) as f:
        tmp_path = f.name
    dm.to_sdf(mol, tmp_path)
    return tmp_path


# One PyMOL group per cluster, containing one pose per ligand.
for cluster, df in FULL_DF.groupby("Type_3"):
    IDs = []
    for lig_name, sub_df in df.groupby("Lig_Name"):
        # Sort into a new frame; sorting the groupby slice in place can raise
        # SettingWithCopyWarning.
        # NOTE(review): ascending=False keeps the pose with the HIGHEST "Score"
        # per ligand - confirm that higher is better for this column.
        TOP = sub_df.sort_values(by="Score", ascending=False).iloc[0]

        ID = TOP["ID"]
        IDs.append(ID)
        TMP_MOL = tmp_save(TOP["Molecule"])

        try:
            cmd.load(TMP_MOL, ID)
        finally:
            # The temporary SDF is only a transport file for PyMOL; remove it
            # so repeated runs do not pile up files in the temp directory.
            os.remove(TMP_MOL)

    cmd.group(f"Cluster_{cluster}", " ".join(IDs))

cmd.save("TMP.pse")

## Pairwise Calculation

```python
clustering_kmed = OptimizedKMedoidsClustering(
    k_range=(2, 20),
    use_dimensionality_reduction=True,
    verbose=True
)
labels_kmed = clustering_kmed.fit(rmsd_matrix)
results_kmed = clustering_kmed.get_results()
```

```python
clustering_dbscan = OptimizedDBSCANClustering(
    eps_range=(0.5, 5.0, 0.5),
    min_samples_range=(2, 15),
    max_noise_percent=15.0,
    max_clusters = 10,
    use_dimensionality_reduction=True,
    verbose=True
)
labels_dbscan = clustering_dbscan.fit(rmsd_matrix)
results_dbscan = clustering_dbscan.get_results()
```

In [None]:
# Pairwise matrix over ALL_MOLS, used as input for the DBSCAN clustering.
# NOTE(review): this repeats the computation that produced `distance_matrix`
# earlier (same inputs, same metric, same settings) - confirm whether the
# earlier result could be reused instead of recomputing.
rmsd_funct = partial(calc_rmsd_mcs_with_timeout, timeout=60)
computer = PairwiseMatrixComputer(
    ALL_MOLS,
    n_jobs=8,
    timeout=60,
)
rmsd_matrix = computer.compute_matrix(rmsd_funct)

In [None]:
# Search settings for the DBSCAN clusterer, collected in one place.
dbscan_settings = {
    "eps_range": (0.5, 5.0, 0.5),
    "min_samples_range": (2, 15),
    "max_noise_percent": 15.0,
    "max_clusters": 10,
    "use_dimensionality_reduction": True,
    "verbose": True,
}
clustering_dbscan = OptimizedDBSCANClustering(**dbscan_settings)

# Fit on the pairwise matrix, then keep both the labels and the result summary.
labels_dbscan = clustering_dbscan.fit(rmsd_matrix)
results_dbscan = clustering_dbscan.get_results()

In [None]:
# Attach the DBSCAN labels as column "Cluster_DBSCAN".
# assign() builds a new frame instead of writing into FULL_DF in place; FULL_DF
# was produced by boolean filtering, where in-place column assignment can raise
# pandas' SettingWithCopyWarning.
FULL_DF = FULL_DF.assign(Cluster_DBSCAN=labels_dbscan)

In [None]:
from pymol import cmd

cmd.reinitialize()
# Hand PyMOL a plain string rather than a pathlib.Path object.
cmd.load(str(protein_pdb))


def tmp_save(mol: Chem.rdchem.Mol) -> str:
    """Write ``mol`` to a new temporary ``.sdf`` file and return the file's path.

    The file is created with ``delete=False``, so it is NOT removed
    automatically; the caller is responsible for deleting it.
    """
    # Reserve a unique file name, then close the handle before writing: a
    # NamedTemporaryFile that is still open cannot be reopened by name on Windows.
    with NamedTemporaryFile(suffix=".sdf", delete=False) as f:
        tmp_path = f.name
    dm.to_sdf(mol, tmp_path)
    return tmp_path


# One PyMOL group per DBSCAN cluster, containing one pose per ligand.
for cluster, df in FULL_DF.groupby("Cluster_DBSCAN"):
    IDs = []
    for lig_name, sub_df in df.groupby("Lig_Name"):
        # Sort into a new frame; sorting the groupby slice in place can raise
        # SettingWithCopyWarning.
        # NOTE(review): ascending=False keeps the pose with the HIGHEST "Score"
        # per ligand - confirm that higher is better for this column.
        TOP = sub_df.sort_values(by="Score", ascending=False).iloc[0]

        ID = TOP["ID"]
        IDs.append(ID)
        TMP_MOL = tmp_save(TOP["Molecule"])

        try:
            cmd.load(TMP_MOL, ID)
        finally:
            # The temporary SDF is only a transport file for PyMOL; remove it
            # so repeated runs do not pile up files in the temp directory.
            os.remove(TMP_MOL)

    cmd.group(f"Cluster_{cluster}", " ".join(IDs))

# NOTE(review): the hierarchical-clustering export cell also saves to "TMP.pse",
# so running both cells leaves only this session on disk - confirm that is intended.
cmd.save("TMP.pse")

## Protein Minimization

In [None]:
from HandsFreeDocking.tools.Protein_Minimization import ProteinMinimizer

```python
protein_minimizer = ProteinMinimizer(FULL_DF, "Molecule", "PDB_Path")
protein_minimizer(protein_pdb, protein_pdb)
```

In [ ]:
# Note: The ligand preparation has been refactored into a unified module
# All pipelines now use HandsFreeDocking.tools.Ligand_Preparation.LigandPreparator
# which provides a consistent interface for:
# - Protonation (CDPKit, OpenEye, or Scrubber methods)
# - Stereoisomer enumeration
# - Tautomer enumeration (optional)
# - 3D conformation generation
# - Consistent naming: {ligand_name}_Iso{i}_Taut{j}