## PharmGKB SNP Drug-Set Library
### Drug-set labels: Single-Nucleotide Polymorphisms
#### ALL DATABASES ACCESSED 10/2019
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import pandas as pd
from collections import defaultdict
import csv
import numpy as np
import os

In [2]:
os.chdir('../../scripts')
from export_script import *
os.chdir('../notebooks/PharmGKB')

### Importing metadata of all drugs from PharmGKB to ensure each chemical has a unique PubChem ID 
#### INPUT FILES : chemicals.tsv (https://www.pharmgkb.org/downloads)

In [3]:
df_chemicals = pd.read_csv('input/chemicals.tsv', delimiter = '\t', usecols = ['PharmGKB Accession Id',
                                                                              'PubChem Compound Identifiers'])
df_chemicals = df_chemicals.rename(columns = {'PharmGKB Accession Id':'Entity2_id',
                                             'PubChem Compound Identifiers':'pubchem_id'})

In [4]:
df_chemicals.head()

Unnamed: 0,Entity2_id,pubchem_id
0,PA166131343,135564825
1,PA166178312,22267
2,PA166177606,71316608
3,PA166178554,643985
4,PA166165069,15593908


In [5]:
# Removing any rows with a null value in the PubChem Compound Identifier column #
df_chemicals = df_chemicals[pd.notnull(df_chemicals['pubchem_id'])]
df_chemicals = df_chemicals[~df_chemicals['pubchem_id'].str.contains(',')]
df_chemicals['pubchem_id'] = df_chemicals['pubchem_id'].astype(int)

### Importing tsv file of variant - chemical relationships
#### Input file : relationships.tsv (https://www.pharmgkb.org/downloads)

In [6]:
df = pd.read_csv('input/relationships.tsv', delimiter = '\t')

In [7]:
df.head()

Unnamed: 0,Entity1_id,Entity1_name,Entity1_type,Entity2_id,Entity2_name,Entity2_type,Evidence,Association,PK,PD,PMIDs
0,PA166181185,CYP2D6*14,Haplotype,PA131301952,gefitinib,Chemical,VariantAnnotation,associated,,PD,26323212
1,PA166181185,CYP2D6*14,Haplotype,PA134688071,atomoxetine,Chemical,GuidelineAnnotation,associated,,,30801677
2,PA166181185,CYP2D6*14,Haplotype,PA161925594,tropisetron,Chemical,GuidelineAnnotation,associated,,,28002639
3,PA166181185,CYP2D6*14,Haplotype,PA164925725,Toxic liver disease,Disease,VariantAnnotation,associated,,PD,26323212
4,PA166181185,CYP2D6*14,Haplotype,PA166110256,bufuralol,Chemical,"ClinicalAnnotation,VariantAnnotation",associated,PK,,11470994;11950793;12152006;17470523;18784265;1...


In [8]:
# Retaining relevant associations
df_snps = df[(df.Entity1_type == 'Haplotype') | (df.Entity1_type == 'Variant')]
df_snps = df_snps[df_snps.Entity2_type == 'Chemical']
df_snps = df_snps[df_snps.Association == 'associated']

In [9]:
# Appending pubchem IDs
df_snps = df.merge(df_chemicals)

### Importing Drugbank mapping file

In [10]:
drugbank_mapping = pd.read_csv('../../metadata/mapping_files/pubchem.tsv', sep = '\t')

In [11]:
drugbank_mapping.head()

Unnamed: 0,drugbank_id,pubchem_id,inchi_key
0,DB00006,101041682,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,DB00006,126480209,OIRCOABEOLEUMC-GEJPAHFPSA-N
2,DB00006,132229728,OIRCOABEOLEUMC-GEJPAHFPSA-N
3,DB00006,134827539,OIRCOABEOLEUMC-GEJPAHFPSA-N
4,DB00006,137167482,OIRCOABEOLEUMC-GEJPAHFPSA-N


In [12]:
# Matching drugbank IDs based on pubchem IDs
df_snps = drugbank_mapping.merge(df_snps)

In [13]:
df_snps.head()

Unnamed: 0,drugbank_id,pubchem_id,inchi_key,Entity1_id,Entity1_name,Entity1_type,Entity2_id,Entity2_name,Entity2_type,Evidence,Association,PK,PD,PMIDs
0,DB00007,3911,GFIJNRVAKGFPGQ-LIJARHBVSA-N,PA165816582,CYP2D6*10,Haplotype,PA450203,leuprolide,Chemical,VariantAnnotation,not associated,,PD,
1,DB00007,3911,GFIJNRVAKGFPGQ-LIJARHBVSA-N,PA165816576,CYP2D6*1,Haplotype,PA450203,leuprolide,Chemical,VariantAnnotation,not associated,,PD,
2,DB00007,3911,GFIJNRVAKGFPGQ-LIJARHBVSA-N,PA128,CYP2D6,Gene,PA450203,leuprolide,Chemical,VariantAnnotation,not associated,,PD,
3,DB00007,3911,GFIJNRVAKGFPGQ-LIJARHBVSA-N,PA165816579,CYP2D6*4,Haplotype,PA450203,leuprolide,Chemical,VariantAnnotation,not associated,,PD,
4,DB00014,47725,BLCLNMBMMGCOAS-URPVMXJPSA-N,PA27886,ESR2,Gene,PA164747674,goserelin,Chemical,LabelAnnotation,associated,,,


### Creating drug-set library

In [14]:
variant = df_snps['Entity1_name'].tolist()
chemicals = df_snps['inchi_key'].tolist()

In [15]:
variant_dict = tuple(zip(variant, chemicals))

drugsetlibrary = defaultdict(list)
for k,v in variant_dict:
    drugsetlibrary[k].append(v)

In [16]:
len(drugsetlibrary)

6141

In [17]:
# Removing all terms paired with less than 5 drugs #
drugsetlibrary = {k:list(set(v)) for k,v in drugsetlibrary.items() if len(set(v))>=5}

### Library counts

In [18]:
library_counts(drugsetlibrary)

811 unique drugs
1226 unique association terms
14965 unique associations
12.206362153344209 average drugs per term


### Exporting the drug-set library in GMT format

In [19]:
os.chdir('../../data/PharmGKB')

In [20]:
gmt_formatter(drugsetlibrary, 'PharmGKB_snp_drugsetlibrary.gmt')