In [1]:
import os
import shutil
import logging
import tabulate
import cloudpickle as pickle
import pandas as pd
import numpy as np
from functools import partial
import time
from pathlib import Path
import datamol as dm
from IPython.display import clear_output
from tempfile import NamedTemporaryFile, TemporaryDirectory
from itertools import combinations
from joblib import Parallel, delayed
from rdkit import Chem
from rdkit.Chem import rdFMCS
import concurrent.futures
from spyrmsd import rmsd, molecule
from espsim import GetEspSim
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import DBSCAN

import oddt
import oddt.fingerprints
import oddt.shape
import oddt.toolkits.rdk

from HandsFreeDocking.Wrapper_Docking import PipelineDocking
from HandsFreeDocking.analysis.clustering import (
    OptimizedDBSCANClustering, PairwiseMatrixComputer, OptimizedKMedoidsClustering,
    calc_rmsd_mcs_with_timeout, calc_usr_similarity, calc_splif
)

from HandsFreeDocking.tools.Protein_Minimization import ProteinMinimizer

In [2]:
# --- Configuration: input structures, working directory and cache location ---
protein_pdb = Path("./examples/LAG3_Moloc_2.pdb")
ligands_sdf = Path("./examples/Ligands_To_Dock.sdf")
# NOTE: the variable name is misspelled ("cystal"), but it is kept unchanged
# because the docking cell passes it on as `crystal_sdf=cystal_sdf`.
cystal_sdf = Path("./examples/Fake_Crystal.sdf")

docking_dir = Path("./examples/TMP_Docking")
docking_pkl = docking_dir / "docking_results.pkl"

# Check that every expected path exists and report the ones that do not,
# instead of computing the flag and silently discarding it.
files_to_check = [protein_pdb, ligands_sdf, cystal_sdf, docking_dir]
missing_files = [path for path in files_to_check if not path.exists()]
all_files_exist = not missing_files
if not all_files_exist:
    logging.warning(
        "Missing input path(s): %s", ", ".join(str(path) for path in missing_files)
    )

# Set to True to run the docking pipeline again instead of loading the cached pickle.
RERUN = False

In [None]:
# Either run the full docking pipeline and cache its results, or reload the
# cached results from `docking_pkl`. Both branches leave the result in FULL_DF.
if RERUN:
    # Initialize the docking pipeline
    docking = PipelineDocking(
        workdir=docking_dir,
        docking_software=["plants", "gnina", "openeye"],      # Choose one or more: "plants", "gnina", "openeye"
        settings=(10, 4),                                     # (n_conformers, n_cpus)
        protein_pdb=protein_pdb,
        ligands_input=ligands_sdf,                            # Can be SDF or SMILES file
        crystal_sdf=cystal_sdf,
        toolkit="openeye"                                     # Choose "cdpkit" or "openeye"
    )

    # Run the docking and get results
    results = docking.run()
    FULL_DF = docking.concat_df()

    # Make sure the cache location exists before writing to it.
    docking_pkl.parent.mkdir(parents=True, exist_ok=True)
    with open(docking_pkl, "wb") as f:
        pickle.dump(FULL_DF, f)

    # Discard the output printed so far by this cell.
    clear_output()
else:
    if not docking_pkl.exists():
        raise FileNotFoundError(
            f"No cached docking results found at {docking_pkl}; "
            "set RERUN = True to run the docking pipeline first."
        )
    # SECURITY: unpickling can execute arbitrary code. Only load a cache file
    # that was written by this notebook / a trusted source.
    with open(docking_pkl, "rb") as f:
        FULL_DF = pickle.load(f)

In [5]:
# Drop every row whose ID contains "L17".
# NOTE(review): this is a substring match, so IDs such as "L170..." would be
# dropped as well - confirm that this is intended.
# `.copy()` detaches the filtered frame from the original so that adding
# columns later does not operate on a slice.
FULL_DF = FULL_DF[~FULL_DF["ID"].str.contains("L17")].copy()

# The ligand name is the part of the ID before the first underscore.
lig_name_series = FULL_DF["ID"].str.split("_").str[0]

# DataFrame.insert raises ValueError if the column already exists, which made
# this cell fail when executed a second time; drop a stale column first so the
# cell can be re-run safely.
if "Lig_Name" in FULL_DF.columns:
    FULL_DF = FULL_DF.drop(columns="Lig_Name")
FULL_DF.insert(1, "Lig_Name", lig_name_series)

In [9]:
# Collect the "Molecule" column as a plain Python list, in row order.
ALL_MOLS = FULL_DF["Molecule"].to_list()

## Pairwise Calculation

```python
clustering_kmed = OptimizedKMedoidsClustering(
    k_range=(2, 20),
    use_dimensionality_reduction=True,
    verbose=True
)
labels_kmed = clustering_kmed.fit(rmsd_matrix)
results_kmed = clustering_kmed.get_results()
```

```python
clustering_dbscan = OptimizedDBSCANClustering(
    eps_range=(0.5, 5.0, 0.5),
    min_samples_range=(2, 15),
    max_noise_percent=15.0,
    max_clusters = 10,
    use_dimensionality_reduction=True,
    verbose=True
)
labels_dbscan = clustering_dbscan.fit(rmsd_matrix)
results_dbscan = clustering_dbscan.get_results()
```

In [None]:
# Settings for the pairwise computation, named once so the two timeout values
# below cannot drift apart.
PAIRWISE_N_JOBS = 8
PAIRWISE_TIMEOUT = 60  # presumably seconds - TODO confirm against the library

# Pairwise function with its timeout argument pre-bound.
rmsd_funct = partial(calc_rmsd_mcs_with_timeout, timeout=PAIRWISE_TIMEOUT)

# Evaluate the function over all docked molecules and keep the resulting matrix.
computer = PairwiseMatrixComputer(
    ALL_MOLS,
    n_jobs=PAIRWISE_N_JOBS,
    timeout=PAIRWISE_TIMEOUT,
)
rmsd_matrix = computer.compute_matrix(rmsd_funct)

In [None]:
# Settings handed to the DBSCAN optimizer, collected in one place so they are
# easy to read and tweak.
dbscan_settings = {
    "eps_range": (0.5, 5.0, 0.5),   # presumably (start, stop, step) - TODO confirm
    "min_samples_range": (2, 15),
    "max_noise_percent": 15.0,
    "max_clusters": 10,
    "use_dimensionality_reduction": True,
    "verbose": True,
}

# Fit on the pairwise matrix; `fit` returns the labels that are later stored
# per row of FULL_DF, and `get_results` exposes the optimizer's results.
clustering_dbscan = OptimizedDBSCANClustering(**dbscan_settings)
labels_dbscan = clustering_dbscan.fit(rmsd_matrix)
results_dbscan = clustering_dbscan.get_results()

In [16]:
# Attach the DBSCAN label of every row as a new column. `assign` builds a new
# frame instead of writing into FULL_DF in place: FULL_DF was produced by
# boolean filtering, and in-place column assignment on such a frame can trigger
# pandas' SettingWithCopyWarning. Re-running this cell simply overwrites the column.
FULL_DF = FULL_DF.assign(Cluster_DBSCAN=labels_dbscan)

In [36]:
from pymol import cmd

# Start from a clean PyMOL session and load the receptor structure.
cmd.reinitialize()
cmd.load(str(protein_pdb))  # pass a plain string path rather than a Path object


def tmp_save(mol: Chem.rdchem.Mol, directory=None) -> str:
    """Write ``mol`` to a new temporary ``.sdf`` file and return its path.

    Args:
        mol: Molecule to write.
        directory: Optional directory in which to create the file. When
            omitted, the system default temporary directory is used.

    Returns:
        Path of the written SDF file. The file is not deleted automatically;
        pass a ``directory`` that the caller cleans up (for example a
        ``TemporaryDirectory``) to avoid leaving files behind.
    """
    # Only reserve a unique file name here and close the handle before
    # writing: writing by name to a file that is still open fails on Windows.
    with NamedTemporaryFile(suffix=".sdf", delete=False, dir=directory) as handle:
        sdf_path = handle.name
    dm.to_sdf(mol, sdf_path)
    return sdf_path


# All intermediate SDF files live in one scratch directory that is removed as
# soon as every pose has been loaded into PyMOL.
with TemporaryDirectory() as scratch_dir:
    for cluster, cluster_df in FULL_DF.groupby("Cluster_DBSCAN"):
        IDs = []
        for _, lig_df in cluster_df.groupby("Lig_Name"):
            # Keep the row with the highest "Score" for this ligand in this cluster.
            # NOTE(review): assumes a higher Score is better - confirm for the
            # scoring functions in use.
            # Sorting returns a new frame, so the groupby sub-frame is not mutated.
            TOP = lig_df.sort_values(by="Score", ascending=False).iloc[0]

            ID = TOP["ID"]
            IDs.append(ID)
            TMP_MOL = tmp_save(TOP["Molecule"], directory=scratch_dir)

            # Load the pose as a PyMOL object named after its ID.
            cmd.load(TMP_MOL, ID)

        # Group all poses of this cluster under one PyMOL group.
        cmd.group(f"Cluster_{cluster}", " ".join(IDs))

# Save the session (receptor plus grouped poses).
cmd.save("TMP.pse")

## Protein Minimization

In [2]:
from HandsFreeDocking.tools.Protein_Minimization import ProteinMinimizer

```python
protein_minimizer = ProteinMinimizer(FULL_DF, "Molecule", "PDB_Path")
protein_minimizer(protein_pdb, protein_pdb)
```