# DiffDock
Dock a PDB file and a SMILES string with [DiffDock](https://github.com/gcorso/DiffDock).

Select Runtime / Run all to run an example PDB file and SMILES.

May require "premium GPU" (colab pro), and even then it may fail on large complexes.

In [None]:
#@title PDB + SMILES input

# Protein input: an RCSB PDB id, or an http(s) URL of a PDB file
# (download_pdb_file treats any value starting with "http" as a URL).
PDB_id = '' #@param {type:"string"}
# Ligand input: a SMILES string, or an all-numeric PubChem CID that is
# resolved to SMILES. If either field is left empty, the example data is used.
SMILES_or_pubchem_id = '' #@param {type:"string"}

#@markdown Download a tar file containing all results?
# When True, the last cell packs the results into a tar file and downloads it.
download_results = True #@param {type:"boolean"}

time: 1.01 ms (started: 2022-10-06 18:10:33 +00:00)


In [None]:
#@title
import csv
import os
import time
from random import random
from typing import Optional
from urllib.parse import urlparse

import requests

def download_pdb_file(pdb_id: str) -> str:
    """Fetch a PDB file into the local cache and return the path to it.

    Args:
        pdb_id: Either an RCSB PDB identifier (e.g. ``"6agt"``) or a full
            http(s) URL pointing at a PDB file.

    Returns:
        Path of the cached file under ``/tmp/pdb/``. If the file is already
        cached it is returned without any network access.

    Raises:
        ValueError: If no file name can be derived from a URL (for example
            when it ends with ``/``).
        requests.HTTPError: If the server answers with an error status.
    """
    PDB_DIR ="/tmp/pdb/"

    # url or pdb_id
    if pdb_id.startswith('http'):
        url = pdb_id
        # Use only the URL path so query strings/fragments do not end up in
        # the cache file name (and later in the CSV, globs and shell commands).
        filename = os.path.basename(urlparse(url).path)
    else:
        url = f"https://files.rcsb.org/view/{pdb_id}.pdb"
        filename = f'{pdb_id}.pdb'

    if not filename:
        # An empty name would make the cache path equal to the cache directory
        # itself, which always "exists" and would be returned as if it were a file.
        raise ValueError(f"Cannot derive a file name from {pdb_id!r}")

    os.makedirs(PDB_DIR, exist_ok=True)
    cache_path = os.path.join(PDB_DIR, filename)
    if os.path.exists(cache_path):
        return cache_path

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    pdb_req = requests.get(url, timeout=60)
    pdb_req.raise_for_status()
    # Context manager guarantees the handle is flushed and closed.
    with open(cache_path, 'w') as cache_file:
        cache_file.write(pdb_req.text)
    return cache_path

def download_smiles_str(pubchem_id: str, retries:int = 2) -> Optional[str]:
    """Look up the SMILES string for a PubChem compound id (CID).

    Args:
        pubchem_id: PubChem CID, interpolated into the PUG REST URL.
        retries: Number of extra attempts after the first failed request.
            Values below zero are treated as zero.

    Returns:
        The SMILES string from the first data row of the CSV response, or
        ``None`` if every attempt failed or the response held no usable row.
    """
    url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/CID/{pubchem_id}/property/CanonicalSMILES/CSV"

    smiles_url_csv = None
    for attempt in range(max(retries, 0) + 1):
        if attempt:
            # Randomised pause between attempts (never after the last one).
            time.sleep(1+random())
        try:
            req = requests.get(url, timeout=30)
        except requests.RequestException:
            # Connection errors/timeouts count as a failed attempt.
            continue
        if req.status_code == 200:
            smiles_url_csv = req.text
            break

    if smiles_url_csv is None:
        return None

    # Expected layout: a header row followed by one `CID,SMILES` data row.
    rows = list(csv.reader(smiles_url_csv.splitlines()))
    if len(rows) < 2 or len(rows[1]) < 2:
        return None
    return rows[1][1].strip('"').strip("'") or None

time: 43.5 ms (started: 2022-10-06 18:10:33 +00:00)


In [None]:
# Fall back to the bundled example when either form field was left empty.
if not PDB_id or not SMILES_or_pubchem_id:
    PDB_id = "6agt"
    SMILES_or_pubchem_id = "COc(cc1)ccc1C#N"
    print(f"No input supplied. Using example data: {PDB_id} and {SMILES_or_pubchem_id}")

# to run many PDB+smiles at once, fill in a list of PDB_files and smiles here...
pdb_files = [download_pdb_file(str(PDB_id).strip())]

# An all-numeric ligand field is interpreted as a PubChem CID and resolved to
# SMILES; anything else is taken to be a SMILES string already.
ligand_input = str(SMILES_or_pubchem_id).strip()
if ligand_input.isnumeric():
    resolved_smiles = download_smiles_str(ligand_input)
    if resolved_smiles is None:
        # Without this check the literal text "None" would be written to the
        # CSV and passed on as the ligand.
        raise ValueError(f"Could not resolve PubChem CID {ligand_input} to a SMILES string")
    smiless = [resolved_smiles]
else:
    smiless = [ligand_input]

# One CSV row per protein/ligand combination; later cells read this file.
with open("/tmp/input_protein_ligand.csv", 'w') as out:
    out.write("protein_path,ligand\n")
    for pdb_file in pdb_files:
        for smiles in smiless:
            out.write(f"{pdb_file},{smiles}\n")

No input supplied. Using example data: 6agt and COc(cc1)ccc1C#N
time: 1.2 s (started: 2022-10-06 18:10:33 +00:00)


## Install prerequisites

In [None]:
# ipython-autotime provides the `autotime` extension, which prints the
# "time: ..." line seen under each executed cell.
!pip install ipython-autotime
%load_ext autotime

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 3.63 s (started: 2022-10-06 18:10:35 +00:00)


In [None]:
# Clone DiffDock once; the existence check makes re-running this cell a no-op.
# The checkout pins the code to a fixed commit so the notebook keeps working
# when the upstream repository changes.
if not os.path.exists("/content/DiffDock"):
    %cd /content
    !git clone https://github.com/gcorso/DiffDock.git
    %cd /content/DiffDock
    !git checkout 0f9c419 # remove/update for more up to date code

/content
Cloning into 'DiffDock'...
remote: Enumerating objects: 73, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (10/10), done.[K
remote: Total 73 (delta 5), reused 6 (delta 2), pack-reused 61[K
Unpacking objects: 100% (73/73), done.
/content/DiffDock
Note: checking out '0f9c419'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by performing another checkout.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -b with the checkout command again. Example:

  git checkout -b <new-branch-name>

HEAD is now at 0f9c419 improve README
time: 15.6 s (started: 2022-10-06 18:10:38 +00:00)


In [None]:
!pip install pyg==0.7.1 --quiet
!pip install pyyaml==6.0 --quiet
!pip install scipy==1.7.3 --quiet
!pip install networkx==2.6.3 --quiet
!pip install biopython==1.79 --quiet
!pip install rdkit-pypi==2022.03.5 --quiet
!pip install e3nn==0.5.0 --quiet
!pip install spyrmsd==0.5.2 --quiet
!pip install pandas==1.3.5 --quiet
!pip install biopandas==0.4.1 --quiet
!pip install torch==1.12.1+cu113 --quiet

[?25l[K     |█████                           | 10 kB 32.5 MB/s eta 0:00:01[K     |██████████                      | 20 kB 36.5 MB/s eta 0:00:01[K     |███████████████▏                | 30 kB 44.6 MB/s eta 0:00:01[K     |████████████████████▏           | 40 kB 39.5 MB/s eta 0:00:01[K     |█████████████████████████▏      | 51 kB 43.4 MB/s eta 0:00:01[K     |██████████████████████████████▎ | 61 kB 48.0 MB/s eta 0:00:01[K     |████████████████████████████████| 65 kB 3.6 MB/s 
[?25h  Building wheel for pyg (setup.py) ... [?25l[?25hdone
  Building wheel for pkgtools (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 2.3 MB 31.0 MB/s 
[K     |████████████████████████████████| 36.8 MB 19 kB/s 
[K     |████████████████████████████████| 117 kB 28.8 MB/s 
[K     |████████████████████████████████| 878 kB 27.5 MB/s 
[?25htime: 47.2 s (started: 2022-10-06 18:10:54 +00:00)


In [None]:
import torch

# Install PyTorch Geometric and its compiled companion packages only when
# torch_geometric cannot be imported yet, so re-running the cell is cheap.
try:
    import torch_geometric
except ModuleNotFoundError:
    # Remove any existing copies first, then install wheels from the index page
    # that matches the installed torch version (taken from torch.__version__).
    !pip uninstall torch-scatter torch-sparse torch-geometric torch-cluster  --y
    !pip install torch-scatter -f https://data.pyg.org/whl/torch-{torch.__version__}.html --quiet
    !pip install torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html --quiet
    !pip install torch-cluster -f https://data.pyg.org/whl/torch-{torch.__version__}.html --quiet
    # NOTE(review): this installs whatever is on the default branch at run time
    # (unpinned), so results may differ between runs — consider pinning a release.
    !pip install git+https://github.com/pyg-team/pytorch_geometric.git  --quiet # no version for some reason??

[K     |████████████████████████████████| 7.9 MB 34.1 MB/s 
[K     |████████████████████████████████| 3.5 MB 35.7 MB/s 
[K     |████████████████████████████████| 2.4 MB 30.9 MB/s 
[?25h  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone
time: 26.4 s (started: 2022-10-06 18:11:41 +00:00)


### Download 2GB PDBBind dataset
This download is unnecessary for inference, so the command below is left commented out.

In [None]:
#!test -d /content/DiffDock/data/PDBBind_processed || (wget https://zenodo.org/record/6034088/files/PDBBind.zip && unzip -q PDBBind.zip && mv PDBBind_processed /content/DiffDock/data/)

time: 384 µs (started: 2022-10-06 18:12:08 +00:00)


## Install ESM and prepare PDB file for ESM

In [None]:
# Clone the ESM repository inside the DiffDock checkout (once), pin it to a
# fixed commit and install it in editable mode. The final %cd returns to the
# DiffDock root.
if not os.path.exists("/content/DiffDock/esm"):
    %cd /content/DiffDock
    !git clone https://github.com/facebookresearch/esm
    %cd /content/DiffDock/esm
    !git checkout f07aed6 # remove/update for more up to date code
    !sudo pip install -e .
    %cd /content/DiffDock

/content/DiffDock
Cloning into 'esm'...
remote: Enumerating objects: 628, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 628 (delta 4), reused 7 (delta 3), pack-reused 612[K
Receiving objects: 100% (628/628), 10.51 MiB | 15.35 MiB/s, done.
Resolving deltas: 100% (363/363), done.
/content/DiffDock/esm
Note: checking out 'f07aed6'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by performing another checkout.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -b with the checkout command again. Example:

  git checkout -b <new-branch-name>

HEAD is now at f07aed6 fix fairscale inference example (#298)
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Obtaining file:///content

In [None]:
# Run DiffDock's ESM preparation script on the protein/ligand CSV written
# earlier; it writes data/prepared_for_esm.fasta, which the next cell passes
# to the ESM extract script.
%cd /content/DiffDock
!python datasets/esm_embedding_preparation.py --protein_ligand_csv /tmp/input_protein_ligand.csv --out_file data/prepared_for_esm.fasta 

/content/DiffDock
100% 1/1 [00:00<00:00,  3.93it/s]
time: 923 ms (started: 2022-10-06 18:12:35 +00:00)


In [None]:
%cd /content/DiffDock
%env HOME=esm/model_weights
%env PYTHONPATH=$PYTHONPATH:/content/DiffDock/esm
!python /content/DiffDock/esm/scripts/extract.py esm2_t33_650M_UR50D data/prepared_for_esm.fasta data/esm2_output --repr_layers 33 --include per_tok

/content/DiffDock
env: HOME=esm/model_weights
env: PYTHONPATH=$PYTHONPATH:/content/DiffDock/esm
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t33_650M_UR50D.pt" to esm/model_weights/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t33_650M_UR50D-contact-regression.pt" to esm/model_weights/.cache/torch/hub/checkpoints/esm2_t33_650M_UR50D-contact-regression.pt
Transferred model to GPU
Read data/prepared_for_esm.fasta with 4 sequences
Processing 1 of 1 batches (4 sequences)
time: 3min 6s (started: 2022-10-06 18:12:36 +00:00)


## Run DiffDock

In [None]:
# Run DiffDock inference on every row of the input CSV. Results are written to
# results/user_predictions_small, which the download cell reads from.
# NOTE(review): the step/sample/batch values are passed straight to DiffDock's
# inference module — presumably they trade run time and GPU memory against the
# number of poses produced; confirm against the DiffDock documentation.
%cd /content/DiffDock
!python -m inference --protein_ligand_csv /tmp/input_protein_ligand.csv --out_dir results/user_predictions_small --inference_steps 20 --samples_per_complex 40 --batch_size 10

## Download results

In [None]:
if download_results:
    from google.colab import files
    from glob import glob
    from shlex import quote

    %cd /content/DiffDock/results/user_predictions_small
    out_fs = []
    for pdb_file in pdb_files:
        !cp {pdb_file} .
        pdb_file_root = pdb_file.split("/")[-1]
        out_fs.append(f"{pdb_file_root}")
        for smiles in smiless:
            sglob = ''.join([c if c in "CONH" else "?" for c in smiles])
            out_fs += glob(f"*{pdb_file_root}*{sglob}*/rank*_*.sdf")

    out_fs_bash = ' '.join([quote(f) for f in out_fs])
    !tar cvf diffdock_results.tar {out_fs_bash}
    files.download("diffdock_results.tar")