## PharmGKB SNP Drug-Set Library
### Drug-set labels: Single-Nucleotide Polymorphisms
#### ALL DATABASES ACCESSED 10/2019
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import pandas as pd
from collections import defaultdict
import csv
import numpy as np
import os

In [2]:
# Change into the shared scripts directory so `export_script` can be imported
# from the working directory, then change back so the relative `input/...`
# paths used below resolve again.
# NOTE(review): the wildcard import presumably provides `library_counts` and
# `gmt_formatter`, which are called later but never defined in this notebook —
# confirm against export_script.py.
# NOTE(review): if the import raises, the working directory is left pointing at
# the scripts folder; re-running the cell will then fail on the first chdir.
os.chdir('../../scripts')
from export_script import *
os.chdir('../notebooks/PharmGKB')

### Importing metadata of all drugs from PharmGKB and keeping only chemicals that map to a single PubChem ID
#### Input file : chemicals.tsv (https://www.pharmgkb.org/downloads)

In [3]:
# Load only the three columns needed from the PharmGKB chemicals table and
# rename the accession / PubChem columns to the names used by the later joins
# (`Entity2_id` is the chemical accession column in relationships.tsv).
chemical_columns = ['PharmGKB Accession Id', 'PubChem Compound Identifiers', 'SMILES']
chemical_column_aliases = {
    'PharmGKB Accession Id': 'Entity2_id',
    'PubChem Compound Identifiers': 'pubchem_id',
}
df_chemicals = (
    pd.read_csv('input/chemicals.tsv', sep='\t', usecols=chemical_columns)
    .rename(columns=chemical_column_aliases)
)

In [4]:
# Preview the first rows to check the selected and renamed columns.
df_chemicals.head()

Unnamed: 0,Entity2_id,SMILES,pubchem_id
0,PA166131343,C1[C@@H](C=C[C@@H]1N2C=NC3=C2NC(=NC3=O)N)COP(=...,135564825
1,PA166178312,,22267
2,PA166177606,,71316608
3,PA166178554,,643985
4,PA166165069,CC(C)[C@](CCCNCCC1=CC(=C(C=C1)OC)OC)(C#N)C2=CC...,15593908


In [5]:
# Keep only chemicals that map to exactly one PubChem compound ID:
#   * rows with no PubChem identifier are dropped, and
#   * rows listing several comma-separated identifiers are dropped.
# `na=False` makes missing values evaluate as "no comma" so one mask covers both
# conditions; `regex=False` matches the comma literally.
has_single_pubchem_id = (
    df_chemicals['pubchem_id'].notnull()
    & ~df_chemicals['pubchem_id'].str.contains(',', na=False, regex=False)
)
# Explicit copy so the column assignment below writes to an independent frame
# rather than to a filtered selection of the original.
df_chemicals = df_chemicals.loc[has_single_pubchem_id].copy()
df_chemicals['pubchem_id'] = df_chemicals['pubchem_id'].astype(int)

### Importing tsv file of variant - chemical relationships
#### Input file : relationships.tsv (https://www.pharmgkb.org/downloads)

In [6]:
# Load the complete PharmGKB relationships table (tab-separated).
relationships_path = 'input/relationships.tsv'
df = pd.read_csv(relationships_path, sep='\t')

In [7]:
# Preview the first rows of the relationships table.
df.head()

Unnamed: 0,Entity1_id,Entity1_name,Entity1_type,Entity2_id,Entity2_name,Entity2_type,Evidence,Association,PK,PD,PMIDs
0,PA166181185,CYP2D6*14,Haplotype,PA131301952,gefitinib,Chemical,VariantAnnotation,associated,,PD,26323212
1,PA166181185,CYP2D6*14,Haplotype,PA134688071,atomoxetine,Chemical,GuidelineAnnotation,associated,,,30801677
2,PA166181185,CYP2D6*14,Haplotype,PA161925594,tropisetron,Chemical,GuidelineAnnotation,associated,,,28002639
3,PA166181185,CYP2D6*14,Haplotype,PA164925725,Toxic liver disease,Disease,VariantAnnotation,associated,,PD,26323212
4,PA166181185,CYP2D6*14,Haplotype,PA166110256,bufuralol,Chemical,"ClinicalAnnotation,VariantAnnotation",associated,PK,,11470994;11950793;12152006;17470523;18784265;1...


In [8]:
# Keep rows where a haplotype or variant (Entity1) is linked to a chemical
# (Entity2) and the relationship is recorded as 'associated'.
is_variant_or_haplotype = df['Entity1_type'].isin(['Haplotype', 'Variant'])
targets_chemical = df['Entity2_type'] == 'Chemical'
is_associated = df['Association'] == 'associated'
df_snps = df[is_variant_or_haplotype & targets_chemical & is_associated]

In [9]:
# Append SMILES and PubChem IDs to each association by joining on the PharmGKB
# chemical accession. The key is named explicitly so the join cannot silently
# change if the two frames ever come to share another column name. The inner
# join drops associations whose chemical was removed during the PubChem ID
# clean-up.
df_snps = df_snps.merge(df_chemicals, on='Entity2_id', how='inner')

### Importing DrugBank mapping file

In [11]:
# Load the mapping file and rename its PharmGKB accession column to
# `Entity2_id`, the name of the chemical accession column in the associations.
drugbank_mapping = (
    pd.read_csv('../../metadata/mapping_files/pharmgkb.tsv', sep='\t')
    .rename(columns={'pharmgkb_id': 'Entity2_id'})
)

In [12]:
# Preview the mapping table after renaming the accession column.
drugbank_mapping.head()

Unnamed: 0,drugbank_id,Entity2_id,inchi_key
0,DB00006,PA10032,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,DB00007,PA450203,GFIJNRVAKGFPGQ-LIJARHBVSA-N
2,DB00014,PA164747674,BLCLNMBMMGCOAS-URPVMXJPSA-N
3,DB00035,PA449237,NFLWUMRGJYTJIN-PNIOQBSNSA-N
4,DB00050,PA164764506,SBNPWPIBESPSIF-MHWMIDJBSA-N


In [13]:
# Attach DrugBank IDs and InChIKeys to each association by joining on the
# PharmGKB chemical accession (`Entity2_id`), the only column the two frames
# share. The key is named explicitly so the join does not depend on which
# column names happen to overlap. The inner join drops associations whose
# chemical has no entry in the mapping file.
df_snps = drugbank_mapping.merge(df_snps, on='Entity2_id', how='inner')

In [14]:
# Preview the merged table (mapping columns first, then association columns).
df_snps.head()

Unnamed: 0,drugbank_id,Entity2_id,inchi_key,Entity1_id,Entity1_name,Entity1_type,Entity2_name,Entity2_type,Evidence,Association,PK,PD,PMIDs,SMILES,pubchem_id
0,DB00014,PA164747674,BLCLNMBMMGCOAS-URPVMXJPSA-N,PA166154579,rs4149056,Variant,goserelin,Chemical,VariantAnnotation,associated,,PD,,CC(C)CC(C(=O)NC(CCCN=C(N)N)C(=O)N1CCCC1C(=O)NN...,47725
1,DB00080,PA164768820,DOAKLVKFURWEDJ-RWDRXURGSA-N,PA166157284,rs1045642,Variant,daptomycin,Chemical,"ClinicalAnnotation,VariantAnnotation",associated,PK,,25239468.0,CCCCCCCCCC(=O)N[C@@H](CC1=CNC2=CC=CC=C21)C(=O)...,16129629
2,DB00091,PA449167,PMATZTZNYRCHOR-CGLBZJNRSA-N,PA166157030,rs2275913,Variant,cyclosporine,Chemical,VariantAnnotation,associated,,PD,30799725.0,CCC1C(=O)N(CC(=O)N(C(C(=O)NC(C(=O)N(C(C(=O)NC(...,6435893
3,DB00091,PA449167,PMATZTZNYRCHOR-CGLBZJNRSA-N,PA166155843,rs17514110,Variant,cyclosporine,Chemical,VariantAnnotation,associated,,PD,,CCC1C(=O)N(CC(=O)N(C(C(=O)NC(C(=O)N(C(C(=O)NC(...,6435893
4,DB00091,PA449167,PMATZTZNYRCHOR-CGLBZJNRSA-N,PA166155117,rs17264736,Variant,cyclosporine,Chemical,VariantAnnotation,associated,,PD,,CCC1C(=O)N(CC(=O)N(C(C(=O)NC(C(=O)N(C(C(=O)NC(...,6435893


### Creating drug-set library

In [15]:
# Parallel lists: position i of each list comes from row i of df_snps.
# `variant` holds the variant/haplotype label, `chemicals` the drug's InChIKey.
variant, chemicals = (
    df_snps[column].tolist() for column in ('Entity1_name', 'inchi_key')
)

In [16]:
# (label, InChIKey) pairs, one per row of the merged table. Despite its name,
# `variant_dict` is a tuple of pairs, not a dict.
variant_dict = tuple(zip(variant, chemicals))

# Group the InChIKeys under their variant/haplotype label. Duplicates are kept
# at this stage.
drugsetlibrary = defaultdict(list)
for label, inchi_key in variant_dict:
    members = drugsetlibrary[label]
    members.append(inchi_key)

In [18]:
# Remove all terms paired with fewer than MIN_DRUGS_PER_TERM distinct drugs and
# de-duplicate the drugs of the terms that remain.
MIN_DRUGS_PER_TERM = 5

# Build each term's set of distinct drugs once and reuse it for both the size
# check and the output.
unique_drugs_by_term = {term: set(drugs) for term, drugs in drugsetlibrary.items()}

# Iteration order of a set of strings can differ between interpreter sessions
# (string hash randomisation), so the members are sorted to keep the exported
# library identical from run to run. `key=str` keeps the sort from failing if a
# non-string value (e.g. a missing InChIKey) is present.
drugsetlibrary = {
    term: sorted(drugs, key=str)
    for term, drugs in unique_drugs_by_term.items()
    if len(drugs) >= MIN_DRUGS_PER_TERM
}

### Library counts

In [19]:
# Print summary statistics for the filtered library (see the output below).
# NOTE(review): `library_counts` is not defined in this notebook; presumably it
# comes from the wildcard import of export_script — confirm.
library_counts(drugsetlibrary)

483 unique drugs
554 unique association terms
5555 unique associations
10.027075812274369 average drugs per term


### Exporting the drug-set library in GMT format

In [20]:
# Change the working directory to the PharmGKB data folder before exporting.
# NOTE(review): after this cell the relative `input/...` paths used by earlier
# cells no longer resolve, so re-running them requires restarting the kernel
# or changing the directory back.
os.chdir('../../data/PharmGKB')

In [21]:
# Export the drug-set library under the given GMT file name.
# NOTE(review): `gmt_formatter` is not defined in this notebook; presumably it
# comes from export_script and writes relative to the current working
# directory — confirm.
gmt_formatter(drugsetlibrary, 'PharmGKB_snp_drugsetlibrary.gmt')