# PockDock

This notebook demonstrates how a variety of different tools can be glued together into an efficient and flexible workflow using **Crossflow**.

The workflow downloads a protein-ligand complex from the PDB, runs fpocket, then docks the ligand back into the biggest pocket found. Then it calculates the error between the crystal structure coordinates of the ligand and those of each docking pose, before and after least-squares fitting.

The notebook requires you to have versions of **fpocket**, **autodock tools** and **autodock vina** installed on the worker node(s) of your dask cluster, which either:

 - is up and running and identifiable via the file "dask.dat" in the current directory (created when dask-scheduler is started with the `--scheduler-file` option)
 - will be a local cluster created right here, in which case scheduler_file=None

In [None]:
from crossflow import filehandling, tasks, clients
import sys
from urllib.request import urlretrieve
import numpy as np
import mdtraj as mdt

Create a crossflow client, connected to a pool of workers (see introductory notes):

In [None]:
# None selects the second option described in the introduction: a local cluster
# created right here. To use an already-running cluster, set this to the
# scheduler file instead (e.g. "dask.dat").
scheduler_file = None
client = clients.Client(scheduler_file=scheduler_file)
# Bare expression: the notebook shows the client as this cell's output.
client

Make the SubprocessTasks for **fpocket** and **Vina**, and FunctionTasks for other tasks:

In [None]:
# The fpocket task:
# The command template refers to its input by the literal file name 'x.pdb',
# which is then declared as the task's single input.
# NOTE(review): presumably crossflow stages the submitted object under that
# name on the worker - confirm against the crossflow documentation.
fpocket = tasks.SubprocessTask('fpocket -f x.pdb')
fpocket.set_inputs(['x.pdb'])
# The one declared output is the pdb file inside the 'x_out' directory.
fpocket.set_outputs(['x_out/x_out.pdb'])

In [None]:
# The vina task:
# The template mixes two kinds of input: literal file names (r.pdbqt, l.pdbqt)
# and {name}-style placeholders for the search box centre (xc, yc, zc) and
# size (sx, sy, sz). Vina's standard output is redirected to dock.log.
# NOTE(review): presumably the placeholders are filled from the like-named
# inputs at submit time - confirm against the crossflow documentation.
vina = tasks.SubprocessTask('vina --receptor r.pdbqt --ligand l.pdbqt --out out.pdbqt'
                                 ' --center_x {xc} --center_y {yc} --center_z {zc}'
                                 ' --size_x {sx} --size_y {sy} --size_z {sz} > dock.log')
# Input order here is the positional argument order used when the task is submitted.
vina.set_inputs(['r.pdbqt', 'l.pdbqt', 'xc', 'yc', 'zc', 'sx', 'sy', 'sz'])
# Two outputs: the docked poses and the captured log.
vina.set_outputs(['out.pdbqt', 'dock.log'])

In [None]:
# AutoDock Tool based tasks to prepare receptor and ligand for docking:
# Both tasks have the same shape: one pdb file in ('x.pdb'), one pdbqt file
# out ('x.pdbqt'). They differ only in the AutoDock Tools script that is run.
# The 'adt' launcher must be available on the worker node(s).

# Receptor preparation (prepare_receptor4.py):
prep_receptor = tasks.SubprocessTask('adt prepare_receptor4.py -r x.pdb -o x.pdbqt')
prep_receptor.set_inputs(['x.pdb'])
prep_receptor.set_outputs(['x.pdbqt'])

# Ligand preparation (prepare_ligand4.py):
prep_ligand = tasks.SubprocessTask('adt prepare_ligand4.py -l x.pdb -o x.pdbqt')
prep_ligand.set_inputs(['x.pdb'])
prep_ligand.set_outputs(['x.pdbqt'])

In [None]:

def _download_and_split(pdb_code, ligand_residue_name):
    '''
    A function to download a pdb file, and split into receptor and ligand

    The receptor is the set of protein atoms in the first chain (chainid 0).
    The ligand is every atom with residue name ``ligand_residue_name`` in the
    first chain that contains a residue of that name.

    Args:
        pdb_code (str): 4-letter PDB code
        ligand_residue_name (str): 3-letter residue name

    Returns:
        mdt.trajectory: the receptor (protein atoms of chain 0 only)
        mdt.trajectory: the ligand

    Raises:
        ValueError: if the structure contains no residue called
            ``ligand_residue_name``.
    '''
    pdb_file = pdb_code + '.pdb'
    # The file is saved into the current working directory under its PDB name.
    urlretrieve('http://files.rcsb.org/download/' + pdb_file, pdb_file)
    hydrated_complex = mdt.load(pdb_file)
    topology = hydrated_complex.topology
    receptor_atoms = topology.select('protein and chainid 0')

    # Index of the first chain holding a residue with the requested name
    # (None if there is no such residue anywhere in the structure).
    cid = next(
        (chain.index
         for chain in topology.chains
         for residue in chain.residues
         if residue.name == ligand_residue_name),
        None,
    )
    if cid is None:
        # Without this check the selection below would fail with a NameError
        # that says nothing about the real problem.
        raise ValueError(
            'No residue named {!r} found in {}'.format(ligand_residue_name, pdb_file)
        )

    ligand_atoms = topology.select('resname {} and chainid {}'.format(ligand_residue_name, cid))
    # Re-read the file restricted to each atom subset to build the two structures.
    receptor = mdt.load(pdb_file, atom_indices=receptor_atoms)
    ligand = mdt.load(pdb_file, atom_indices=ligand_atoms)
    return receptor, ligand
# Now make a FunctionTask for it:
# The input names mirror the function's two parameters; the output names label
# the two values it returns (receptor first, ligand second).
download_and_split = tasks.FunctionTask(_download_and_split)
download_and_split.set_inputs(['pdb_code', 'ligand_residue_name'])
download_and_split.set_outputs(['receptor', 'ligand'])

In [None]:
def _pdbqt2pdb(infile):
    '''
    A Function to convert pdbqt files back to pdb ones

    Only lines that start with one of the record names ATOM, HETATM, MODEL or
    ENDMDL are kept. They are copied verbatim and in their original order;
    every other line of the input is dropped.

    Args:
        infile (str): name of the input file, .pdbqt format

    Returns:
        str: name of the .pdb file (always 'tmp.pdb')
    '''
    outfile = 'tmp.pdb'
    # Six-character record names, padded with spaces where needed.
    kept_records = ('ATOM  ', 'HETATM', 'MODEL ', 'ENDMDL')
    # Both files are managed by the same `with`, so the output is closed even
    # if reading the input fails part-way through.
    with open(infile, 'r') as fin, open(outfile, 'w') as fout:
        fout.writelines(line for line in fin if line.startswith(kept_records))
    return outfile

# Now make a FunctionTask for this:
# One input (the pdbqt file) and one output (the file named by the function's
# return value, labelled 'outfile').
pdbqt2pdb = tasks.FunctionTask(_pdbqt2pdb)
pdbqt2pdb.set_inputs(['infile'])
pdbqt2pdb.set_outputs(['outfile'])

In [None]:
def _get_dimensions(pockets):
    '''
    Locate the bounding box of the largest pocket reported by fpocket.

    The box is taken from the atoms selected by 'resname STP and residue 1',
    which should be the largest pocket. Its centre is the midpoint of the
    minimum and maximum coordinate along each axis; its extent is the
    max-min span along each axis plus a fixed 2.0 Angstrom buffer.

    Args:
        pockets (str): Name of the pdb format file produced by fpocket

    Returns:
        (float,) * 6: the pocket centre (xc, yc, zc) and extents (sx, sy, sz)
            in x/y/z - in Angstroms
    '''
    nm_to_angstrom = 10  # MDTraj coordinates are in nanometres
    padding = 2.0  # added to each extent after conversion, so in Angstroms
    pocket_traj = mdt.load(pockets)
    pocket_atoms = pocket_traj.topology.select('resname STP and residue 1')
    # Coordinates of the selected atoms in the first (only) frame.
    coords = pocket_traj.xyz[0][pocket_atoms]
    lower = coords.min(axis=0)
    upper = coords.max(axis=0)
    xc, yc, zc = tuple(nm_to_angstrom * (lower + upper) / 2)
    sx, sy, sz = tuple(nm_to_angstrom * (upper - lower) + padding)
    return xc, yc, zc, sx, sy, sz

# Now make a FunctionTask for this:
# One input (the fpocket output file); six named outputs, in the same order as
# the function's return tuple: centre (xc, yc, zc) then extents (sx, sy, sz).
get_dimensions = tasks.FunctionTask(_get_dimensions)
get_dimensions.set_inputs(['pockets'])
get_dimensions.set_outputs(['xc', 'yc', 'zc', 'sx', 'sy', 'sz'])

Now we construct the workflow. For convenience it's split up here into sections.

In [None]:
# The two workflow parameters: which PDB entry to fetch, and the residue name
# of the ligand to pull out of it.
pdb_code = '1qy1'
ligand_residue_name = 'PRZ'

# Submitting the task gives back one handle per declared output.
receptor, ligand = client.submit(download_and_split, pdb_code, ligand_residue_name)
# .result() retrieves the actual ligand object; printing it is a quick sanity
# check that the download and split worked.
print(ligand.result())

In [None]:
# Run fpocket:
# The receptor handle from the previous step is passed straight in as the
# task's 'x.pdb' input; no .result() call is needed here.
pockets = client.submit(fpocket, receptor)

In [None]:
# Find the dimensions of the biggest pocket
# Six handles come back: the box centre (xc, yc, zc) and box size (sx, sy, sz),
# matching the outputs declared for get_dimensions.
xc, yc, zc, sx, sy, sz = client.submit(get_dimensions, pockets)

In [None]:
# Prepare receptor and ligand for docking:
# Each task takes its structure from the download step and yields a pdbqt
# file. The two submissions do not depend on each other.
receptor_qt = client.submit(prep_receptor, receptor)
ligand_qt = client.submit(prep_ligand, ligand)

In [None]:
# Run vina:
# Arguments follow the order given to vina.set_inputs: receptor, ligand, box
# centre, box size. The two results are the docked poses and the log file.
docks, logfile = client.submit(vina, receptor_qt, ligand_qt, xc, yc, zc, sx, sy, sz)

In [None]:
# Check the log file:
# .result() retrieves the log file object, whose text (vina's redirected
# standard output, 'dock.log') is then printed.
print(logfile.result().read_text())

In [None]:
# Turn the docked poses back into PDB format, then compare every pose with the
# crystal-structure ligand using MDTraj - first as-is (unfitted), then after
# least-squares fitting:
NM_TO_ANGSTROM = 10.0  # MDTraj works in nanometres

pdbout = client.submit(pdbqt2pdb, docks)
docktraj = mdt.load(pdbout.result())
crystal_ligand = ligand.result()

# Unfitted rmsd: per-pose root of the mean (over atoms) squared displacement.
dxyz = docktraj.xyz - crystal_ligand.xyz
msd = np.square(dxyz).sum(axis=2).mean(axis=1)
unfitted_rmsd = np.sqrt(msd) * NM_TO_ANGSTROM

# Fitted rmsd, straight from MDTraj.
rmsd = mdt.rmsd(docktraj, crystal_ligand) * NM_TO_ANGSTROM

print('Pose Fitted   Unfitted')
print('      rmsd      rmsd')
for mode, (fitted, unfitted) in enumerate(zip(rmsd, unfitted_rmsd), start=1):
    print('{:3d}   {:5.3f}    {:6.3f}'.format(mode, fitted, unfitted))