# Problem-1
This tutorial shows how to find proteins for a specific organism, how to calculate protein-protein interactions, and how to visualize the results.

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import substring_index
from mmtfPyspark.datasets import pdbjMineDataset
from mmtfPyspark.webfilters import PdbjMineSearch
from mmtfPyspark.interactions import InteractionFilter, InteractionFingerprinter
from mmtfPyspark.io import mmtfReader
from ipywidgets import interact, IntSlider
import py3Dmol

from mmtfPyspark.structureViewer import view_structure

#### Configure Spark

In [2]:
# Create (or reuse) a Spark session on the local master "local[4]" and keep a
# handle to its SparkContext, which the MMTF reader below requires.
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("Problem-1")
    .getOrCreate()
)
sc = spark.sparkContext

## Find protein structures for mouse

For our first task, we need to run a taxonomy query using SIFTS data. See these [examples](https://github.com/sbl-sdsc/mmtf-pyspark/blob/master/demos/datasets/PDBMetaDataDemo.ipynb) and the [SIFTS demo](https://github.com/sbl-sdsc/mmtf-pyspark/blob/master/demos/datasets/SiftsDataDemo.ipynb).

To see how to query by taxonomy, the cell below lists the first 10 entries of the SIFTS taxonomy table. As you can see, we can use the `scientific_name` field to query for a specific organism.

In [3]:
# Preview query: fetch the first 10 rows of the SIFTS chain-taxonomy table so
# we can inspect which columns are available for filtering.
taxonomy_query = "SELECT * FROM sifts.pdb_chain_taxonomy LIMIT 10"
taxonomy = pdbjMineDataset.get_dataset(taxonomy_query)
taxonomy.show()

+-----+-----+------+--------------------+----------------+
|pdbid|chain|tax_id|     scientific_name|structureChainId|
+-----+-----+------+--------------------+----------------+
| 5DQL|    D|652616|Mycobacterium tub...|          5DQL.D|
| 5DQL|    D|652616|Mycobacterium tub...|          5DQL.D|
| 5DQM|    A|210840|Aequorea coerules...|          5DQM.A|
| 5DQM|    A|210840|Aequorea coerules...|          5DQM.A|
| 5DQM|    A|210840|      belt jellyfish|          5DQM.A|
| 5DQN|    A|246196|               MYCS2|          5DQN.A|
| 5DQN|    A|246196|Mycobacterium sme...|          5DQN.A|
| 5DQN|    A|246196|Mycobacterium sme...|          5DQN.A|
| 5DQN|    A|246196|Mycobacterium sme...|          5DQN.A|
| 5DQN|    A|246196|Mycobacterium sme...|          5DQN.A|
+-----+-----+------+--------------------+----------------+



### TODO-1: specify a taxonomy query where the scientific name is 'Mus musculus'

In [4]:
# NOTE(review): the TODO-1 heading asks for a taxonomy query where the
# scientific name is 'Mus musculus', but this query is a keyword search for
# 'ribosome' joined to the entity table and restricted to rows with
# pdbx_number_of_molecules > 100. Confirm which of the two is intended.
taxonomy_query = """ 
SELECT keyword_search.pdbid FROM keyword_search('ribosome')
INNER JOIN entity ON keyword_search.pdbid = entity.pdbid
WHERE entity.pdbx_number_of_molecules > 100 
"""

# Run the query through PDBj Mine and display the first 10 result rows.
taxonomy = pdbjMineDataset.get_dataset(taxonomy_query)
taxonomy.show(10)

+-----------+
|structureId|
+-----------+
|       4B3R|
|       4B3T|
|       4DR1|
|       4DR1|
|       4DR2|
|       4DR2|
|       4DR3|
|       4DR3|
|       4DR5|
|       4DR5|
+-----------+
only showing top 10 rows



In [5]:
# Directory to read MMTF files from; the path is relative, so it is resolved
# against the notebook's working directory.
path = './ribo_mmtf_sample/'

# Read the MMTF files under `path` using the SparkContext `sc`.
# Presumably yields a keyed RDD of structures (`.keys()` is called on it below).
pdb = mmtfReader.read_mmtf_files(path, sc)

In [6]:
# Collect the keys of `pdb` on the driver and hand them to the structure viewer.
# The trailing semicolon suppresses the cell's return-value repr.
view_structure(pdb.keys().collect());

interactive(children=(IntSlider(value=0, continuous_update=False, description='Structure', max=2), Output()), …

In [7]:
### TODO-2: Take the taxonomy query from above and use it to filter the pdb structures

In [8]:
#pdb = pdb.filter(PdbjMineSearch(taxonomy_query)).cache()

## Calculate polymer-polymer interactions for this subset of structures
Find polymer-polymer interactions with a 6 Å distance cutoff.

In [9]:
# Distance cutoff passed to the interaction filter; it is reused further down
# as the highlight distance in the 3D visualization.
distance_cutoff = 6.0
# NOTE(review): minInteractions=10 presumably discards chain pairs with fewer
# than 10 interacting groups -- confirm against the InteractionFilter docs.
interactionFilter = InteractionFilter(distance_cutoff, minInteractions=10)

# Compute polymer interactions for the structures in `pdb` and cache the
# result, because several later cells read from it.
interactions = InteractionFingerprinter.get_polymer_interactions(pdb, interactionFilter).cache()

In [10]:
# structureChainId looks like "4UG0.S2"; keep everything before the first '.'
# as a separate structureId column (e.g. "4UG0") and cache the widened frame.
interactions = (
    interactions
    .withColumn("structureId", substring_index(interactions.structureChainId, '.', 1))
    .cache()
)

In [11]:
# Preview the first 10 interactions. Limit on the Spark side so only 10 rows
# are transferred to the driver, instead of converting the whole DataFrame to
# pandas and slicing it afterwards.
interactions.limit(10).toPandas()

Unnamed: 0,structureChainId,queryChainId,targetChainId,groupNumbers,sequenceIndices,sequence,structureId
0,4UG0.S2,Se,S2,"[523, 524, 525, 526, 527, 534, 535, 550, 551, ...","[522, 523, 524, 525, 526, 533, 534, 549, 550, ...",UACCUGGUUGAUCCUGCCAGUAGCAUAUGCUUGUCUCAAAGAUUAA...,4UG0
1,4UG0.Se,S2,Se,"[10, 11, 12, 13, 15, 16, 17, 19, 20, 21, 22, 2...","[7, 8, 9, 10, 11, 12, 14, 15, 16, 18, 19, 20, ...",KVHGSLARAGKVRGQTPKVAKQEKKKKKTGRAKRRMQYNRRFVNVV...,4UG0
2,4UG0.SX,Se,SX,"[134, 54, 55, 56, 58, 66, 68, 69, 71, 87, 88, ...","[53, 54, 55, 57, 65, 67, 68, 70, 86, 87, 88, 8...",MGKCRGLRTARKLRSHRRDQKWHDKQYKKAHLGTALKANPFGGASH...,4UG0
3,4UG0.Se,SX,Se,"[10, 11, 12, 13, 2, 3, 5, 6, 7, 8, 9]","[1, 2, 4, 5, 6, 7, 8, 9, 10, 11, 12]",KVHGSLARAGKVRGQTPKVAKQEKKKKKTGRAKRRMQYNRRFVNVV...,4UG0
4,4UG0.SJ,Se,SJ,"[122, 124, 125, 127, 128, 25, 26, 27, 29, 30, ...","[24, 25, 26, 28, 29, 30, 31, 32, 33, 35, 36, 3...",MPVARSWVCRKTYVTPRRPFEKSRLDQELKLIGEYGLRNKREVWRV...,4UG0
5,1VY4.DA,D9,DA,"[1021, 1029, 1030, 1031, 1032, 1033, 1034, 112...","[1065, 1073, 1074, 1075, 1076, 1077, 1078, 116...",GUCAAGAUGGUAAGGGCCCACGGUGGAUGCCUCGGCACCCGAGCCG...,1VY4
6,1VY4.D9,DA,D9,"[1, 10, 15, 16, 17, 18, 19, 2, 20, 21, 22, 23,...","[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 14, 15, 16, 17,...",MKVRASVKRICDKCKVIRRHGRVYVICENPKHKQRQG,1VY4
7,4V6U.BY,B3,BY,"[10, 129, 130, 131, 132, 133, 134, 135, 136, 1...","[9, 13, 15, 47, 48, 50, 51, 52, 128, 129, 130,...",MAKLAVIRIRGRVNVKRPVRDTLAMLRLHRVNHCVIVDDTPSYLGM...,4V6U
8,4V6U.B3,BY,B3,"[81, 82, 83, 84, 85, 93, 94, 95, 96, 97]","[80, 81, 82, 83, 84, 92, 93, 94, 95, 96]",CGGCGGCCAUAGCGGGGGGGCCACACCCGGUCUCAUUUCGAACCCG...,4V6U
9,4V6U.BO,B3,BO,"[1, 10, 105, 109, 11, 110, 112, 113, 114, 115,...","[0, 1, 2, 3, 4, 5, 8, 9, 10, 11, 12, 14, 15, 1...",MAHGPRYRVPFRRRREGKTNYRKRLKLLKSGKPRLVVRKSLNHHIA...,4V6U


## Visualize the protein-protein interactions

#### Extract id columns as lists (required for visualization)

In [12]:
# Collect all columns needed for visualization in ONE Spark job. Collecting
# each column separately launches four jobs and relies on every job returning
# rows in the same order; a single collect guarantees that index i refers to
# the same interaction in all four lists.
# Column names are spelled exactly as in the DataFrame schema
# (queryChainId / targetChainId) so this does not depend on Spark's
# case-insensitive column resolution.
interaction_rows = interactions.select(
    "structureId", "queryChainId", "targetChainId", "groupNumbers"
).collect()

structure_ids = [row["structureId"] for row in interaction_rows]
query_chain_ids = [row["queryChainId"] for row in interaction_rows]
target_chain_ids = [row["targetChainId"] for row in interaction_rows]
# One entry per interaction: the list of group numbers of the interacting residues.
target_groups = [row["groupNumbers"] for row in interaction_rows]

Disable scrollbar for the visualization below

In [13]:
%%javascript 
// Replace the output area's scroll check with a function that always returns
// false, so cell outputs are never put into a scrollable box.
IPython.OutputArea.prototype._should_scroll = function(lines) {return false;}

<IPython.core.display.Javascript object>

#### Show polymer-polymer interactions within the cutoff distance (query = orange, target = blue)

In [14]:
def view_protein_protein_interactions(structure_ids, query_chain_ids, target_chain_ids, target_groups, distance=4.5):
    """Browse chain-chain interactions interactively with py3Dmol.

    The four list arguments are parallel: index ``i`` in each list describes
    the same interaction. A slider selects which interaction is displayed.

    Colors: the query chain is drawn in orange, the target chain in light blue.
    Query-chain atoms within ``distance`` of the target chain are shown as
    spheres; the interacting target residues are shown as sticks and the view
    is zoomed onto them.

    Args:
        structure_ids: PDB ids, one per interaction (used as ``'pdb:<id>'``
            query for py3Dmol).
        query_chain_ids: chain id of the query chain, one per interaction.
        target_chain_ids: chain id of the target chain, one per interaction.
        target_groups: for each interaction, the residue numbers of the
            target chain to highlight.
        distance: distance used for the "within" selection that picks the
            query-chain atoms rendered as spheres (default 4.5).

    Returns:
        The result of ``ipywidgets.interact`` for the viewer callback.

    Raises:
        ValueError: if ``structure_ids`` is empty or the four lists do not
            all have the same length.
    """
    n_interactions = len(structure_ids)

    # Fail early with a clear message: an empty list would otherwise create a
    # slider with max=-1 and fail with an opaque widget error.
    if n_interactions == 0:
        raise ValueError("No interactions to display: structure_ids is empty.")

    # The lists are indexed in parallel, so a length mismatch would only
    # surface later as an IndexError inside the slider callback.
    if not (len(query_chain_ids) == len(target_chain_ids) == len(target_groups) == n_interactions):
        raise ValueError(
            "structure_ids, query_chain_ids, target_chain_ids and target_groups "
            "must all have the same length."
        )

    def view3d(i=0):
        """Render interaction ``i`` in a py3Dmol viewer."""
        print(f"PDB: {structure_ids[i]}, query: {query_chain_ids[i]}, target: {target_chain_ids[i]}")

        # Selection for the interacting residues on the target chain.
        target = {'chain': target_chain_ids[i], 'resi': target_groups[i]}

        viewer = py3Dmol.view(query='pdb:' + structure_ids[i], width=600, height=600)
        # Reset styling with an empty style before the chain-specific styles are applied.
        viewer.setStyle({})

        # Query chain: lines overall, spheres for atoms within `distance` of the target chain.
        viewer.setStyle({'chain': query_chain_ids[i]}, {'line': {'colorscheme': 'orangeCarbon'}})
        viewer.setStyle({'chain' : query_chain_ids[i], 'within':{'distance' : distance, 'sel':{'chain': target_chain_ids[i]}}}, {'sphere': {'colorscheme': 'orangeCarbon'}});
        # Target chain: lines overall, sticks for the interacting residues.
        viewer.setStyle({'chain': target_chain_ids[i]}, {'line': {'colorscheme': 'lightblueCarbon'}})
        viewer.setStyle(target, {'stick': {'colorscheme': 'lightblueCarbon'}})
        viewer.zoomTo(target)

        return viewer.show()

    s_widget = IntSlider(min=0, max=n_interactions - 1, description='Structure', continuous_update=False)
    return interact(view3d, i=s_widget)

In [15]:
# Launch the interactive viewer, highlighting query atoms within the same
# cutoff that was used to compute the interactions. The trailing semicolon
# suppresses the cell's return-value repr.
view_protein_protein_interactions(
    structure_ids,
    query_chain_ids,
    target_chain_ids,
    target_groups,
    distance=distance_cutoff,
);

interactive(children=(IntSlider(value=0, continuous_update=False, description='Structure', max=14), Output()),…

In [16]:
# Stop the Spark session created at the top of the notebook.
spark.stop()