# Solution-1
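This solution shows how to find protein-protein interactions in mouse (*Mus musculus*) structures in the PDB by combining two sources of information:

1. Taxonomy annotations from the SIFTS tables (queried through PDBj Mine)
2. Structure data read from a sample of PDB entries in MMTF format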

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import substring_index
from mmtfPyspark.datasets import pdbjMineDataset
from mmtfPyspark.webfilters import PdbjMineSearch
from mmtfPyspark.interactions import InteractionFilter, InteractionFingerprinter
from mmtfPyspark.io import mmtfReader
from ipywidgets import interact, IntSlider
#from IPython.display import Markdown, display
import py3Dmol

#### Configure Spark

In [2]:
# Build (or reuse) a local Spark session; "local[4]" is the master URL for this run.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("2-JoiningDatasets")
    .getOrCreate()
)

# Handle on the underlying SparkContext, which the MMTF reader takes as an argument.
sc = spark.sparkContext

## Find mouse structures with a taxonomy query

- [See examples](https://github.com/sbl-sdsc/mmtf-pyspark/blob/master/demos/datasets/PDBMetaDataDemo.ipynb)
- [SIFTS demo](https://github.com/sbl-sdsc/mmtf-pyspark/blob/master/demos/datasets/SiftsDataDemo.ipynb)

For our first task, we need to run a taxonomy query. To figure out how to query for taxonomy, the command below lists the first 10 entries of the SIFTS taxonomy table. As you can see, we can use the `scientific_name` field to query for a specific organism.

In [3]:
# Preview query: fetch only the first 10 rows of the SIFTS chain-level taxonomy table
# to see which columns are available.
taxonomyQuery = "SELECT * FROM sifts.pdb_chain_taxonomy LIMIT 10"
taxonomy = pdbjMineDataset.get_dataset(taxonomyQuery)
taxonomy.show()

+-----+-----+------+--------------------+----------------+
|pdbid|chain|tax_id|     scientific_name|structureChainId|
+-----+-----+------+--------------------+----------------+
| 101M|    A|  9755|               PHYCD|          101M.A|
| 101M|    A|  9755|    Physeter catodon|          101M.A|
| 101M|    A|  9755|Physeter catodon ...|          101M.A|
| 101M|    A|  9755|Physeter catodon ...|          101M.A|
| 101M|    A|  9755|Physeter macrocep...|          101M.A|
| 101M|    A|  9755|         Sperm whale|          101M.A|
| 101M|    A|  9755|         sperm whale|          101M.A|
| 102L|    A| 10665|                BPT4|          102L.A|
| 102L|    A| 10665|    Bacteriophage T4|          102L.A|
| 102L|    A| 10665|Enterobacteria ph...|          102L.A|
+-----+-----+------+--------------------+----------------+



In [4]:
# Select the chains whose scientific_name is exactly 'Mus musculus'.
# taxonomyQuery keeps this value and is reused later to filter the structures.
taxonomyQuery = "SELECT * FROM sifts.pdb_chain_taxonomy WHERE scientific_name = 'Mus musculus'"
taxonomy = pdbjMineDataset.get_dataset(taxonomyQuery)
# Print only the first 10 matching rows.
taxonomy.show(10)

+-----+-----+------+---------------+----------------+
|pdbid|chain|tax_id|scientific_name|structureChainId|
+-----+-----+------+---------------+----------------+
| 12E8|    H| 10090|   Mus musculus|          12E8.H|
| 12E8|    L| 10090|   Mus musculus|          12E8.L|
| 12E8|    M| 10090|   Mus musculus|          12E8.M|
| 12E8|    P| 10090|   Mus musculus|          12E8.P|
| 15C8|    H| 10090|   Mus musculus|          15C8.H|
| 15C8|    L| 10090|   Mus musculus|          15C8.L|
| 1914|    A| 10090|   Mus musculus|          1914.A|
| 1A0Q|    H| 10090|   Mus musculus|          1A0Q.H|
| 1A0Q|    L| 10090|   Mus musculus|          1A0Q.L|
| 1A14|    H| 10090|   Mus musculus|          1A14.H|
+-----+-----+------+---------------+----------------+
only showing top 10 rows



In [5]:
# Directory handed to the MMTF reader (relative to this notebook).
path = "../resources/mmtf_full_sample/"

# fraction=0.1 presumably loads a ~10% subset of the entries
# — TODO confirm the sampling semantics (and whether the sample is seeded).
pdb = mmtfReader.read_sequence_file(path, sc, fraction=0.1)

In [6]:
# Keep only the entries matched by taxonomyQuery (the Mus musculus query when the
# cells are run in order) and cache the result for reuse.
# NOTE(review): re-assigning `pdb` hides the unfiltered sample; a separate name
# would keep both available.
pdb = pdb.filter(PdbjMineSearch(taxonomyQuery)).cache()

In [7]:
# Interaction criteria: a distance cutoff of 4.5 and a minimum of 10 interactions
# (cutoff units are presumably Angstrom — TODO confirm against the library docs).
interactionFilter = InteractionFilter(distanceCutoff=4.5, minInteractions=10)

# Compute polymer interactions for the filtered structures; the result is cached
# because several later cells read from it.
interactions = InteractionFingerprinter.get_polymer_interactions(pdb, interactionFilter).cache()

In [8]:
# Derive the PDB id from structureChainId by taking the text before the first '.'
# (e.g. "4M48.A" -> "4M48"); the visualization below needs it as its own column.
interactions = interactions.withColumn("structureId", substring_index(interactions.structureChainId, '.', 1)).cache()

# Preview 10 rows. Limiting in Spark first means only those rows are transferred
# to the driver, instead of converting the whole dataset to pandas and then
# discarding everything but the head.
interactions.limit(10).toPandas()

Unnamed: 0,structureChainId,queryChainId,targetChainId,groupNumbers,sequenceIndices,sequence,structureId
0,4M48.A,H,A,"[337, 338, 498, 502, 505, 506, 508, 509, 510, ...","[70, 274, 275, 435, 439, 442, 443, 445, 446, 4...",MNSISDERETWSGKVDFLLSVIGFAVDLANVWRFPYLCYKNGGGAF...,4M48
1,4M48.H,A,H,"[100, 101, 102, 103, 33, 50, 52, 53, 54, 55, 5...","[51, 68, 70, 71, 72, 73, 74, 75, 77, 117, 118,...",MNFGLRLVFLVLILKGVQCEVQLVESGGGLVKPGGSLKLSCAASGF...,4M48
2,4M48.L,H,L,"[117, 119, 120, 122, 124, 125, 128, 132, 134, ...","[53, 54, 56, 58, 60, 64, 65, 66, 68, 71, 72, 1...",MDFQVQIFSFLLISASVAMSRGENVLTQSPAIMSTSPGEKVTMTCR...,4M48
3,4M48.H,L,H,"[102, 103, 104, 105, 106, 107, 109, 110, 128, ...","[55, 57, 61, 62, 63, 64, 65, 68, 77, 79, 80, 1...",MNFGLRLVFLVLILKGVQCEVQLVESGGGLVKPGGSLKLSCAASGF...,4M48
4,4NN5.A,C,A,"[130, 133, 134, 136, 137, 138, 34, 35, 39, 43,...","[14, 15, 19, 23, 31, 32, 33, 34, 35, 36, 37, 3...",YNFSNCNFTSITKIYCNIIFHDLTGDLKGAKFEQIEDCESKPACLL...,4NN5
5,4NN5.C,A,C,"[106, 107, 108, 109, 110, 112, 192, 193, 194, ...","[68, 69, 70, 71, 86, 87, 88, 89, 90, 92, 172, ...",AAAVTSRGDVTVVCHDLETVEVTWGSGPDHHGANLSLEFRYGTGAL...,4NN5
6,2QDQ.A,B,A,"[2497, 2500, 2501, 2504, 2505, 2507, 2508, 251...","[5, 8, 9, 12, 13, 15, 16, 19, 20, 22, 23, 26, ...",GAMVGGIAQIIAAQEEMLRKERELEEARKKLAQIRQQQYKFLPSEL...,2QDQ
7,2QDQ.B,A,B,"[2497, 2500, 2501, 2504, 2505, 2507, 2508, 251...","[5, 8, 9, 12, 13, 15, 16, 19, 20, 22, 23, 26, ...",GAMVGGIAQIIAAQEEMLRKERELEEARKKLAQIRQQQYKFLPSEL...,2QDQ
8,4P3A.C,D,C,"[698, 701, 702, 705, 708, 709, 710, 712, 718, ...","[21, 24, 25, 28, 31, 32, 33, 35, 41, 44, 45, 4...",GANLHLLRQKIEEQAAKYKHSVPKKCCYDGARVNFYETCEERVARV...,4P3A
9,4P3A.D,C,D,"[698, 701, 702, 705, 708, 709, 710, 712, 713, ...","[21, 24, 25, 28, 31, 32, 33, 35, 36, 41, 44, 4...",GANLHLLRQKIEEQAAKYKHSVPKKCCYDGARVNFYETCEERVARV...,4P3A


## Visualize protein-protein interactions

#### Extract id columns as lists (required for visualization)

In [9]:
# Collect all id columns in a single Spark job so the four lists are guaranteed
# to describe the same rows in the same order (they are indexed in parallel by
# the viewer). Column names are spelled exactly as they appear in the schema.
interaction_rows = interactions.select(
    "structureId", "queryChainId", "targetChainId", "groupNumbers"
).collect()

structure_ids = [row["structureId"] for row in interaction_rows]
query_chain_ids = [row["queryChainId"] for row in interaction_rows]
target_chain_ids = [row["targetChainId"] for row in interaction_rows]
# One list of group numbers per interaction (a list of lists).
target_groups = [row["groupNumbers"] for row in interaction_rows]

Disable scrollbar for the visualization below

In [10]:
%%javascript 
// Replace the output area's scroll check with a function that always returns
// false, so the output below is not placed in a scrolling box.
IPython.OutputArea.prototype._should_scroll = function(lines) {return false;}

<IPython.core.display.Javascript object>

#### Show protein-protein interactions within 4.5 Å (query = orange, target = blue)

In [11]:
def view_protein_protein_interactions(structure_ids, query_chain_ids, target_chain_ids, target_groups, distance=4.5):
    """Browse protein-protein interactions in an interactive py3Dmol viewer.

    The four sequences are indexed in parallel: entry ``i`` of each describes
    one interaction. A slider selects the interaction to display. The query
    chain is drawn as orange lines, the target chain as light-blue lines, and
    the interacting residues of the target chain as light-blue sticks; the
    view is zoomed to those residues.

    Parameters
    ----------
    structure_ids : sequence
        PDB ids; each is loaded in the viewer as ``'pdb:' + id``.
    query_chain_ids : sequence
        Chain id of the query chain for each interaction.
    target_chain_ids : sequence
        Chain id of the target chain for each interaction.
    target_groups : sequence
        For each interaction, the residue numbers of the target chain that
        are passed to the viewer as the ``resi`` selection.
    distance : float, optional
        Accepted for backward compatibility; the rendering does not use it.

    Returns
    -------
    The value returned by ``ipywidgets.interact``, or ``None`` when
    ``structure_ids`` is empty and there is nothing to display.
    """
    # With no interactions the slider would get max=-1 < min=0, which is not a
    # valid range; report the situation instead of failing in the widget.
    if len(structure_ids) == 0:
        print("No interactions to display.")
        return None

    def view3d(i=0):
        """Render interaction ``i`` and show the viewer."""
        print(f"PDB: {structure_ids[i]}, query: {query_chain_ids[i]}, target: {target_chain_ids[i]}")

        # Selection of the interacting residues on the target chain.
        target = {'chain': target_chain_ids[i], 'resi': target_groups[i]}

        viewer = py3Dmol.view(query='pdb:' + structure_ids[i])

        # Whole chains as thin lines; later styles override earlier ones, so the
        # interacting target residues end up as sticks.
        viewer.setStyle({'chain': query_chain_ids[i]}, {'line': {'colorscheme': 'orangeCarbon'}})
        viewer.setStyle({'chain': target_chain_ids[i]}, {'line': {'colorscheme': 'lightblueCarbon'}})
        viewer.setStyle(target, {'stick': {'colorscheme': 'lightblueCarbon'}})
        viewer.zoomTo(target)

        return viewer.show()

    # continuous_update=False: redraw only when the slider is released.
    s_widget = IntSlider(min=0, max=len(structure_ids)-1, description='Structure', continuous_update=False)
    return interact(view3d, i=s_widget)

In [12]:
# Launch the interactive viewer; the trailing semicolon suppresses the cell's
# return-value repr.
view_protein_protein_interactions(
    structure_ids,
    query_chain_ids,
    target_chain_ids,
    target_groups,
    distance=4.5,
);

interactive(children=(IntSlider(value=0, continuous_update=False, description='Structure', max=43), Output()),…

In [13]:
# Stop the Spark session created at the top of the notebook.
spark.stop()