# PDB Drug Bank Mapping

Join the PDB ligand dataset with the DrugBank dataset on their InChIKey.

## Imports and variables

In [1]:
from pyspark import SparkConf, SparkContext                    
from mmtfPyspark.datasets import customReportService, drugBankDataset
                                                               
# Create variables
APP_NAME = "MMTF_Spark"
# NOTE(review): `path` is not referenced by any cell visible in this notebook;
# it is kept so that nothing depending on it breaks.
path = "../../resources/mmtf_full_sample/"

# Configure Spark
# getOrCreate() returns the already-running SparkContext if there is one, so
# re-running this cell in a live kernel no longer fails with
# "Cannot run multiple SparkContexts at once". When a context already exists,
# `conf` is not re-applied to it.
conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
sc = SparkContext.getOrCreate(conf=conf)

## Download open DrugBank dataset

In [29]:
# Load the open DrugBank drug-links dataset and preview the first 5 rows.
# Rows for drugs without a structure-derived key show StandardInChIKey=None.
drugBank = drugBankDataset.get_open_drug_links()
drugBank.head(5)

[Row(DrugBankID='DB00001', AccessionNumbers='BTD00024 | BIOD00024', Commonname='Lepirudin', CAS='138068-37-8', UNII='Y43GF64R34', Synonyms='Hirudin variant-1 | Lepirudin recombinant', StandardInChIKey=None),
 Row(DrugBankID='DB00002', AccessionNumbers='BTD00071 | BIOD00071', Commonname='Cetuximab', CAS='205923-56-4', UNII='PQX0D8J21J', Synonyms='Cetuximab | Cétuximab | Cetuximabum | Immunoglobulin G 1 (human-mouse monoclonal C 225 gamma 1 - chain anti-human epidermal growt factor receptor), disulfide wit human-mouse monoclonal C 225 kappa - chain, dimer', StandardInChIKey=None),
 Row(DrugBankID='DB00003', AccessionNumbers='BTD00001 | BIOD00001', Commonname='Dornase alfa', CAS='143831-71-4', UNII='953A26OA1Y', Synonyms='Deoxyribonuclease (human clone 18-1 protein moiety) | Dornase alfa, recombinant | Dornase alpha | Recombinant deoxyribonuclease (DNAse)', StandardInChIKey=None),
 Row(DrugBankID='DB00004', AccessionNumbers='BTD00084 | BIOD00084', Commonname='Denileukin diftitox', CAS='17

## Filter out DrugBank entries without StandardInChIKey

In [30]:
# Keep only DrugBank rows that carry a StandardInChIKey; rows with a null key
# cannot take part in the InChIKey join further down.
has_standard_inchikey = drugBank["StandardInChIKey"].isNotNull()
drugBank = drugBank.where(has_standard_inchikey)
drugBank.head(5)

[Row(DrugBankID='DB00006', AccessionNumbers='BTD00076 | EXPT03302 | BIOD00076 | DB02351', Commonname='Bivalirudin', CAS='128270-60-0', UNII='TN9BEX005G', Synonyms='Bivalirudina | Bivalirudinum | Hirulog', StandardInChIKey='OIRCOABEOLEUMC-GEJPAHFPSA-N'),
 Row(DrugBankID='DB00014', AccessionNumbers='BTD00113 | BIOD00113', Commonname='Goserelin', CAS='65807-02-5', UNII='0F65R8P09N', Synonyms=None, StandardInChIKey='BLCLNMBMMGCOAS-URPVMXJPSA-N'),
 Row(DrugBankID='DB00027', AccessionNumbers='BTD00036 | BIOD00036', Commonname='Gramicidin D', CAS='1405-97-6', UNII='5IE62321P4', Synonyms='Bacillus brevis gramicidin D | Gramicidin | Gramicidin A | Gramicidin B | Gramicidin C | Gramicidine', StandardInChIKey='NDAYQJDHGXTBJL-MWWSRJDJSA-N'),
 Row(DrugBankID='DB00035', AccessionNumbers='BTD00112 | BTD00061 | BIOD00112 | BIOD00061', Commonname='Desmopressin', CAS='16679-58-6', UNII='ENR1LLB0FP', Synonyms='1-(3-mercaptopropionic acid)-8-D-arginine-vasopressin | 1-deamino-8-D-arginine vasopressin | 1-

## Get PDB ligand annotations

In [31]:
# Ligand annotation columns requested from the PDB custom report service
ligand_fields = [
    "ligandId",
    "ligandMolecularWeight",
    "ligandFormula",
    "ligandSmiles",
    "InChIKey",
]
ligands = customReportService.get_dataset(ligand_fields)
ligands.head(10)

[Row(structureChainId='100D.A', structureId='100D', chainId='A', ligandId='SPM', ligandMolecularWeight=202.34, ligandFormula='C10 H26 N4', ligandSmiles='C(CCNCCCN)CNCCCN', InChIKey='PFNFFQXMRSDOHW-UHFFFAOYSA-N'),
 Row(structureChainId='100D.B', structureId='100D', chainId='B', ligandId=None, ligandMolecularWeight=None, ligandFormula=None, ligandSmiles=None, InChIKey=None),
 Row(structureChainId='101D.A', structureId='101D', chainId='A', ligandId='CBR', ligandMolecularWeight=386.09, ligandFormula='C9 H13 Br N3 O7 P', ligandSmiles='C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(=O)(O)O)O', InChIKey='PLDRCXOBLRYJSZ-RRKCRQDMSA-N'),
 Row(structureChainId='101D.A', structureId='101D', chainId='A', ligandId='MG', ligandMolecularWeight=24.31, ligandFormula='Mg 2', ligandSmiles='[Mg+2]', InChIKey='JLVVSXFLKOJNIY-UHFFFAOYSA-N'),
 Row(structureChainId='101D.B', structureId='101D', chainId='B', ligandId='CBR', ligandMolecularWeight=386.09, ligandFormula='C9 H13 Br N3 O7 P', ligandSmiles='C1[C@@H]([

## Filter out PDB ligand entries without InChIKey

In [32]:
# Drop chains with no ligand annotation (InChIKey is null for those rows)
has_inchikey = ligands["InChIKey"].isNotNull()
ligands = ligands.where(has_inchikey)
ligands.head(5)

[Row(structureChainId='100D.A', structureId='100D', chainId='A', ligandId='SPM', ligandMolecularWeight=202.34, ligandFormula='C10 H26 N4', ligandSmiles='C(CCNCCCN)CNCCCN', InChIKey='PFNFFQXMRSDOHW-UHFFFAOYSA-N'),
 Row(structureChainId='101D.A', structureId='101D', chainId='A', ligandId='CBR', ligandMolecularWeight=386.09, ligandFormula='C9 H13 Br N3 O7 P', ligandSmiles='C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(=O)(O)O)O', InChIKey='PLDRCXOBLRYJSZ-RRKCRQDMSA-N'),
 Row(structureChainId='101D.A', structureId='101D', chainId='A', ligandId='MG', ligandMolecularWeight=24.31, ligandFormula='Mg 2', ligandSmiles='[Mg+2]', InChIKey='JLVVSXFLKOJNIY-UHFFFAOYSA-N'),
 Row(structureChainId='101D.B', structureId='101D', chainId='B', ligandId='CBR', ligandMolecularWeight=386.09, ligandFormula='C9 H13 Br N3 O7 P', ligandSmiles='C1[C@@H]([C@H](O[C@H]1N2C=C(C(=NC2=O)N)Br)COP(=O)(O)O)O', InChIKey='PLDRCXOBLRYJSZ-RRKCRQDMSA-N'),
 Row(structureChainId='101D.B', structureId='101D', chainId='B', ligandId=

## Join ligand dataset with DrugBank info by InChIKey

In [33]:
# Inner join: keep only PDB ligand rows whose InChIKey equals a DrugBank
# StandardInChIKey. Both key columns are retained in the result.
# NOTE: this rebinds `ligands` to the joined frame, so running the cell a
# second time joins the already-joined data against DrugBank again.
inchikey_match = ligands.InChIKey == drugBank.StandardInChIKey
ligands = ligands.join(drugBank, on=inchikey_match, how="inner")
ligands.head(10)

[Row(structureChainId='100D.A', structureId='100D', chainId='A', ligandId='SPM', ligandMolecularWeight=202.34, ligandFormula='C10 H26 N4', ligandSmiles='C(CCNCCCN)CNCCCN', InChIKey='PFNFFQXMRSDOHW-UHFFFAOYSA-N', DrugBankID='DB00127', AccessionNumbers='NUTR00055 | EXPT02947 | DB02564', Commonname='Spermine', CAS='71-44-3', UNII='2FZ7Y3VOQX', Synonyms="4,9-Diaza-1,12-dodecanediamine | 4,9-Diazadodecane-1,12-diamine | N,N'-Bis(3-aminopropyl)-1,4-butanediamine | Spermine", StandardInChIKey='PFNFFQXMRSDOHW-UHFFFAOYSA-N'),
 Row(structureChainId='101D.A', structureId='101D', chainId='A', ligandId='MG', ligandMolecularWeight=24.31, ligandFormula='Mg 2', ligandSmiles='[Mg+2]', InChIKey='JLVVSXFLKOJNIY-UHFFFAOYSA-N', DrugBankID='DB01378', AccessionNumbers=None, Commonname='Magnesium', CAS='7439-95-4', UNII='T6V3LHY838', Synonyms=None, StandardInChIKey='JLVVSXFLKOJNIY-UHFFFAOYSA-N'),
 Row(structureChainId='101M.A', structureId='101M', chainId='A', ligandId='NBN', ligandMolecularWeight=83.13, liga

## Show one example per drug molecule

In [34]:
# Collapse to a single row per drug name, then keep only the columns of
# interest for display. Which structure/chain survives for each drug is
# whatever row dropDuplicates happens to retain.
ligands = (
    ligands
    .dropDuplicates(["Commonname"])
    .select(
        "structureChainId",
        "ligandId",
        "DrugBankID",
        "Commonname",
        "ligandMolecularWeight",
        "ligandFormula",
        "InChIKey",
        "ligandSmiles",
    )
)
# Show the first 10 drugs in alphabetical order of their common name
ligands.orderBy("Commonname").show(10)

+----------------+--------+----------+--------------------+---------------------+---------------+--------------------+--------------------+
|structureChainId|ligandId|DrugBankID|          Commonname|ligandMolecularWeight|  ligandFormula|            InChIKey|        ligandSmiles|
+----------------+--------+----------+--------------------+---------------------+---------------+--------------------+--------------------+
|          1NJ6.A|     A5A|   DB03376|'5'-O-(N-(L-Alany...|                417.4|C13 H19 N7 O7 S|CWWYMWDIYBJVLP-YT...|C[C@@H](C(=O)NS(=...|
|          1NJ5.A|     P5A|   DB02510|'5'-O-(N-(L-Proly...|               443.43|C15 H21 N7 O7 S|LKVJEMXWEODCAY-JV...|c1nc(c2c(n1)n(cn2...|
|          1VQ2.A|     DDN|   DB04280|((2r,3s,5r)-3-Hyd...|                310.2| C9 H15 N2 O8 P|ILSIYJVILUIVPM-LX...|C1[C@@H]([C@H](O[...|
|          1A7A.A|     ADC|   DB03216|(1'r,2's)-9-(2-Hy...|               233.23|  C10 H11 N5 O2|RQPALADHFYHEHK-CH...|c1nc(c2c(n1)n(cn2...|
|          1DCY.A|  

## Terminate Spark

In [17]:
# Shut down the SparkContext created in the setup cell
sc.stop()