# Problem-1
This tutorial shows how to find proteins for a specific organism, how to calculate protein-protein interactions, and visualize the results.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import substring_index
from mmtfPyspark.datasets import pdbjMineDataset
from mmtfPyspark.webfilters import PdbjMineSearch
from mmtfPyspark.interactions import InteractionFilter, InteractionFingerprinter
from mmtfPyspark.io import mmtfReader
from ipywidgets import interact, IntSlider
import py3Dmol

#### Configure Spark

In [2]:
# Build (or reuse) the Spark session for this notebook and keep a handle on its context.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Problem-1")
    .getOrCreate()
)
sc = spark.sparkContext

## Find protein structures for mouse

For our first task, we need to run a taxonomy query using SIFTS data. [See examples](https://github.com/sbl-sdsc/mmtf-pyspark/blob/master/demos/datasets/PDBMetaDataDemo.ipynb) and [SIFTS demo](https://github.com/sbl-sdsc/mmtf-pyspark/blob/master/demos/datasets/SiftsDataDemo.ipynb)

To figure out how to query for taxonomy, the command below lists the first 10 entries of the SIFTS taxonomy table. As you can see, we can use the `scientific_name` field to query for a specific organism.

In [3]:
# Preview the SIFTS chain-level taxonomy table (first 10 rows) to see which columns can be queried.
taxonomy_query = "SELECT * FROM sifts.pdb_chain_taxonomy LIMIT 10"
taxonomy = pdbjMineDataset.get_dataset(taxonomy_query)
taxonomy.show()

+-----+-----+------+--------------------+----------------+
|pdbid|chain|tax_id|     scientific_name|structureChainId|
+-----+-----+------+--------------------+----------------+
| 101M|    A|  9755|               PHYCD|          101M.A|
| 101M|    A|  9755|    Physeter catodon|          101M.A|
| 101M|    A|  9755|Physeter catodon ...|          101M.A|
| 101M|    A|  9755|Physeter catodon ...|          101M.A|
| 101M|    A|  9755|Physeter macrocep...|          101M.A|
| 101M|    A|  9755|         Sperm whale|          101M.A|
| 101M|    A|  9755|         sperm whale|          101M.A|
| 102L|    A| 10665|                BPT4|          102L.A|
| 102L|    A| 10665|    Bacteriophage T4|          102L.A|
| 102L|    A| 10665|Enterobacteria ph...|          102L.A|
+-----+-----+------+--------------------+----------------+



### TODO-1: specify a taxonomy query where the scientific name is 'Mus musculus'

In [5]:
# Restrict the SIFTS taxonomy table to mouse chains; the two literals below
# concatenate into a single SQL statement.
taxonomy_query = (
    "SELECT * FROM sifts.pdb_chain_taxonomy "
    "WHERE scientific_name = 'Mus musculus'"
)
taxonomy = pdbjMineDataset.get_dataset(taxonomy_query)
taxonomy.show(10)

+-----+-----+------+---------------+----------------+
|pdbid|chain|tax_id|scientific_name|structureChainId|
+-----+-----+------+---------------+----------------+
| 1CF8|    H| 10090|   Mus musculus|          1CF8.H|
| 1CF8|    L| 10090|   Mus musculus|          1CF8.L|
| 1CFN|    A| 10090|   Mus musculus|          1CFN.A|
| 1CFN|    B| 10090|   Mus musculus|          1CFN.B|
| 1CFQ|    A| 10090|   Mus musculus|          1CFQ.A|
| 1CFQ|    B| 10090|   Mus musculus|          1CFQ.B|
| 1CFS|    A| 10090|   Mus musculus|          1CFS.A|
| 1CFS|    B| 10090|   Mus musculus|          1CFS.B|
| 1CFT|    A| 10090|   Mus musculus|          1CFT.A|
| 1CFT|    B| 10090|   Mus musculus|          1CFT.B|
+-----+-----+------+---------------+----------------+
only showing top 10 rows



In [6]:
# Location of the MMTF sample data, relative to the notebook's working directory.
path = "../resources/mmtf_full_sample/"

# NOTE(review): fraction=0.1 presumably loads only ~10% of the entries to keep the run fast —
# confirm the sampling behaviour against the mmtfReader documentation.
pdb = mmtfReader.read_sequence_file(path, sc, fraction=0.1)

In [7]:
### TODO-2: Take the taxonomy query from above and use it to filter the pdb structures

In [8]:
# Keep only the structures matched by taxonomy_query (the 'Mus musculus' query when the
# cells are run in order) and cache the result, since the next cells reuse it.
pdb = pdb.filter(PdbjMineSearch(taxonomy_query)).cache()

## Calculate polymer-polymer interactions for this subset of structures
Find protein-protein interactions with a 6 Å distance cutoff.

In [9]:
# Distance cutoff for an interaction (in Angstroms, per the section description);
# it is reused later as the highlight distance in the 3D viewer.
distance_cutoff = 6.0
# NOTE(review): minInteractions=10 presumably discards chain pairs with fewer than
# 10 interacting groups — confirm against the InteractionFilter documentation.
interactionFilter = InteractionFilter(distance_cutoff, minInteractions=10)

# Compute the polymer-polymer interactions for the filtered structures and cache them for reuse.
interactions = InteractionFingerprinter.get_polymer_interactions(pdb, interactionFilter).cache()

In [10]:
# Derive the PDB entry id (e.g. "1MZ9") from the chain-level id (e.g. "1MZ9.A") by
# keeping everything before the first '.'; the viewer below needs the entry id.
interactions = interactions.withColumn("structureId", substring_index(interactions.structureChainId, '.', 1)).cache()
# Limit to 10 rows in Spark *before* converting, so only the preview rows are
# transferred to the driver instead of the whole DataFrame.
interactions.limit(10).toPandas()

Unnamed: 0,structureChainId,queryChainId,targetChainId,groupNumbers,sequenceIndices,sequence,structureId
0,1MZ9.A,E,A,"[27, 29, 30, 32, 33, 34, 35, 36, 37, 39, 40, 4...","[0, 2, 3, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 1...",MDLAPQMLRELQETNAALQDVRELLRQQVKEITFLKNTVMECDAC,1MZ9
1,1MZ9.E,A,E,"[27, 29, 30, 31, 32, 33, 34, 37, 38, 40, 41, 4...","[0, 2, 3, 4, 5, 6, 7, 10, 11, 13, 14, 17, 18, ...",MDLAPQMLRELQETNAALQDVRELLRQQVKEITFLKNTVMECDAC,1MZ9
2,1MZ9.D,E,D,"[27, 30, 31, 32, 33, 34, 37, 38, 39, 40, 41, 4...","[0, 3, 4, 5, 6, 7, 10, 11, 12, 13, 14, 17, 18,...",MDLAPQMLRELQETNAALQDVRELLRQQVKEITFLKNTVMECDAC,1MZ9
3,1MZ9.E,D,E,"[27, 28, 29, 32, 33, 34, 36, 37, 38, 39, 40, 4...","[0, 1, 2, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, ...",MDLAPQMLRELQETNAALQDVRELLRQQVKEITFLKNTVMECDAC,1MZ9
4,4YI0.C,A,C,"[101, 102, 103, 104, 105, 106, 107, 108, 109, ...","[31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 42, 6...",NQGTVNWSVEDIVKGINSNNLESQLQATQAARKLLSREKQPPIDNI...,4YI0
5,4YI0.A,C,A,"[293, 294, 295, 296, 297, 298, 303, 304, 305, ...","[2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 17, 18, 19,...",LKEKKKRTVAEEDQLHLDGQENKRRRHDSS,4YI0
6,2XSD.C,A,C,"[271, 288, 289, 292, 293, 296, 298, 343, 344, ...","[32, 49, 50, 53, 54, 57, 59, 104, 105, 106, 11...",GGEHSDEDAPSSDDLEQFAKQFKQRRIKLGFTQADVGLALGTLYGN...,2XSD
7,2XSD.B,C,B,"[202, 203, 204, 205, 206, 207, 208, 209, 210, ...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]",CCTCATGCATA,2XSD
8,2XSD.C,B,C,"[285, 286, 287, 288, 289, 290, 293, 298, 299, ...","[46, 47, 48, 49, 50, 51, 54, 59, 60, 61, 63, 6...",GGEHSDEDAPSSDDLEQFAKQFKQRRIKLGFTQADVGLALGTLYGN...,2XSD
9,1G2Y.B,D,B,"[10, 12, 13, 16, 17, 19, 20, 21, 22, 23, 24, 2...","[4, 7, 8, 9, 11, 12, 15, 16, 18, 19, 20, 21, 2...",MVSKLSQLQTEMLAALLESGLSKEALIQALGE,1G2Y


## Visualize the protein-protein interactions

#### Extract id columns as lists (required for visualization)

In [11]:
# Collect all four columns in a single Spark job so the resulting lists are guaranteed
# to be row-aligned. Column names use the DataFrame's exact casing ("queryChainId",
# "targetChainId") so the lookup does not depend on Spark's case-insensitive resolution.
interaction_rows = interactions.select(
    "structureId", "queryChainId", "targetChainId", "groupNumbers"
).collect()

structure_ids = [row["structureId"] for row in interaction_rows]
query_chain_ids = [row["queryChainId"] for row in interaction_rows]
target_chain_ids = [row["targetChainId"] for row in interaction_rows]
# Each entry is the list of group numbers of one interaction.
target_groups = [row["groupNumbers"] for row in interaction_rows]

Disable scrollbar for the visualization below

In [12]:
%%javascript 
// Replace the output area's scroll check with one that always returns false, so the
// viewer output below is not placed in a scrollable box.
// NOTE(review): relies on the global `IPython` object provided by the notebook front end —
// confirm it is available in the front end you are using.
IPython.OutputArea.prototype._should_scroll = function(lines) {return false;}

<IPython.core.display.Javascript object>

#### Show protein-protein interactions within cutoff distance  (query = orange, target = blue)

In [13]:
def view_protein_protein_interactions(structure_ids, query_chain_ids, target_chain_ids, target_groups, distance=4.5):
    """Browse protein-protein interactions in an interactive py3Dmol viewer.

    The four list arguments are parallel: index ``i`` in each list describes one
    interaction. A slider selects the interaction to display. For the selected
    entry the query chain is drawn as orange lines, query-chain atoms within
    ``distance`` of the target chain as orange spheres, the target chain as
    light-blue lines, and the interacting target groups as light-blue sticks.

    Parameters
    ----------
    structure_ids : list
        PDB ids, one per interaction; each is fetched as ``'pdb:' + id``.
    query_chain_ids : list
        Chain id of the query chain for each interaction.
    target_chain_ids : list
        Chain id of the target chain for each interaction.
    target_groups : list
        For each interaction, the residue selection (``resi``) of the target
        chain to highlight and zoom to.
    distance : float, optional
        Distance used for the ``within`` selection around the target chain
        (default 4.5).

    Returns
    -------
    The value returned by ``ipywidgets.interact`` for the slider-driven viewer.

    Raises
    ------
    ValueError
        If ``structure_ids`` is empty, since there is nothing to display and
        the slider range would be invalid.
    """
    n_structures = len(structure_ids)
    if n_structures == 0:
        # Fail early with a clear message instead of an opaque widget/index error.
        raise ValueError("No interactions to display: structure_ids is empty.")

    def view3d(i=0):
        """Render interaction ``i`` and print a one-line summary of it."""
        print(f"PDB: {structure_ids[i]}, query: {query_chain_ids[i]}, target: {target_chain_ids[i]}")

        # Selection of the interacting groups on the target chain.
        target = {'chain': target_chain_ids[i], 'resi': target_groups[i]}

        viewer = py3Dmol.view(query='pdb:' + structure_ids[i], width=600, height=600)
        # Clear the default style so only the chains styled below are drawn.
        viewer.setStyle({})

        # Query chain: lines, plus spheres for atoms close to the target chain.
        viewer.setStyle({'chain': query_chain_ids[i]}, {'line': {'colorscheme': 'orangeCarbon'}})
        viewer.setStyle(
            {'chain': query_chain_ids[i], 'within': {'distance': distance, 'sel': {'chain': target_chain_ids[i]}}},
            {'sphere': {'colorscheme': 'orangeCarbon'}},
        )
        # Target chain: lines, plus sticks for the interacting groups.
        viewer.setStyle({'chain': target_chain_ids[i]}, {'line': {'colorscheme': 'lightblueCarbon'}})
        viewer.setStyle(target, {'stick': {'colorscheme': 'lightblueCarbon'}})
        viewer.zoomTo(target)

        return viewer.show()

    # continuous_update=False: only re-render when the slider is released.
    s_widget = IntSlider(min=0, max=n_structures - 1, description='Structure', continuous_update=False)
    return interact(view3d, i=s_widget)

In [14]:
# Launch the interactive viewer, highlighting contacts within the same cutoff that was
# used to compute the interactions; the trailing ';' suppresses the cell's return value.
view_protein_protein_interactions(
    structure_ids,
    query_chain_ids,
    target_chain_ids,
    target_groups,
    distance=distance_cutoff,
);

interactive(children=(IntSlider(value=0, continuous_update=False, description='Structure', max=50), Output()),…

In [13]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()