## Approved Drug Target Drug-Set Library
### Drug-set labels: Entrez Gene Symbols
#### ALL DATABASES ACCESSED 08/2019
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import pandas as pd
from collections import defaultdict
import numpy as np
import os
import json
import csv

In [2]:
# Load the project helpers kept in the repository's scripts/ directory.
# The imports still run from inside scripts/, exactly as before. The working
# directory is restored in `finally`, so a failed import can no longer leave the
# kernel stranded in scripts/ and silently break every relative path used below.
# The directory that was current when the cell started is restored, rather than
# a hard-coded '../notebooks/Drugbank/Approved'.
_notebook_dir = os.getcwd()
os.chdir('../../../scripts')
try:
    # NOTE(review): star imports hide where names come from; the helpers called
    # later in this notebook are gene_resolver, gmt_formatter and library_counts.
    from export_script import *
    from gene_resolver import *
finally:
    os.chdir(_notebook_dir)

### Matching approved drug targets to Entrez Gene Symbols
#### Input File : drugbank_approved_target_polypeptide_ids.csv (https://www.drugbank.ca/releases/latest#protein-identifiers)
#### Downloaded 08/01/2019

In [3]:
# Load the DrugBank target table, keeping only the three columns used in this notebook #
target_csv = 'input/drugbank_approved_target_polypeptide_ids.csv'
target_columns = ['Gene Name', 'Species', 'Drug IDs']
df = pd.read_csv(target_csv, usecols = target_columns)

In [4]:
# Preview the first five rows of the imported table
df.head(n = 5)

Unnamed: 0,Gene Name,Species,Drug IDs
0,ftsI,Haemophilus influenzae (strain ATCC 51907 / DS...,DB00303
1,HDC,Humans,DB00114
2,GLS2,Humans,DB00142
3,F13A1,Humans,DB01839; DB11300; DB11311; DB11571; DB11572; D...
4,NOS2,Humans,DB00155; DB01110; DB01234; DB08814; DB09237; D...


In [5]:
# Dropping all non-human gene names (rows with a missing 'Species' are dropped too) #
# .copy() makes `df` an independent frame instead of a filtered slice of the
# imported table: the gene_resolver call further down does not reassign `df`, so
# it presumably edits it in place, which on a slice triggers pandas'
# SettingWithCopyWarning.
is_human = df['Species'].str.contains('Humans', na = False)
df = df[is_human].copy()

In [6]:
# Number of rows left after the species filter
df.shape[0]

2471

### Validating genes using lookup table
#### Lookup table generated from ftp://ftp.ncbi.nih.gov/gene/DATA/GENE_INFO/Mammalia

In [7]:
# Validate the 'Gene Name' column with the project's gene_resolver helper.
# NOTE(review): the return value is discarded, yet the next cells show fewer rows
# and a new 'Approved Symbol' column, so the helper appears to modify `df` in
# place — confirm against the helper's source.
gene_resolver(df, columnName = 'Gene Name')

In [8]:
# Number of rows left after gene symbol validation
df.shape[0]

2458

In [9]:
# Preview the table after validation
df.head(n = 5)

Unnamed: 0,Gene Name,Species,Drug IDs,Approved Symbol
1,HDC,Humans,DB00114,HDC
2,GLS2,Humans,DB00142,GLS2
3,F13A1,Humans,DB01839; DB11300; DB11311; DB11571; DB11572; D...,F13A1
4,NOS2,Humans,DB00155; DB01110; DB01234; DB08814; DB09237; D...,NOS2
5,HSD17B2,Humans,DB00157; DB00783; DB13952; DB13953; DB13954; D...,HSD17B2


In [10]:
# Turn each "; "-separated 'Drug IDs' string into one row per (drug, gene) pair #
drug_id_lists = df['Drug IDs'].str.split('; ').tolist()
# One row per gene, one column per drug position; shorter lists are padded with missing values
ids_by_gene = pd.DataFrame(drug_id_lists, index = df['Approved Symbol'])
# stack() drops the padding and yields one entry per (gene, position)
long_pairs = ids_by_gene.stack().reset_index()
df_target = long_pairs[[0, 'Approved Symbol']].rename(columns = {0: 'DrugBank ID',
                                                                 'Approved Symbol': 'Gene'})

In [11]:
# Preview the long-format drug/gene pairs
df_target.head(n = 5)

Unnamed: 0,DrugBank ID,Gene
0,DB00114,HDC
1,DB00142,GLS2
2,DB01839,F13A1
3,DB11300,F13A1
4,DB11311,F13A1


### Associate each DrugBank ID with InChI Key

In [12]:
# Load the DrugBank ID -> Standard InChI Key mapping from the Drugmonizome metadata table
mapping_columns = ['DrugBank ID', 'Standard InChI Key']
drugbank_mapping = pd.read_csv('../../../metadata/drugmonizome_metadata.tsv',
                               sep = '\t',
                               usecols = mapping_columns)

In [13]:
# Preview the mapping table
drugbank_mapping.head(n = 5)

Unnamed: 0,DrugBank ID,Standard InChI Key
0,DB00006,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,DB00007,GFIJNRVAKGFPGQ-LIJARHBVSA-N
2,DB00014,BLCLNMBMMGCOAS-URPVMXJPSA-N
3,DB00027,NDAYQJDHGXTBJL-MWWSRJDJSA-N
4,DB00035,NFLWUMRGJYTJIN-PNIOQBSNSA-N


In [14]:
# Attach the InChI Key to every drug/gene pair. No `on=` is given, so pandas joins
# on the columns the two frames share ('DrugBank ID' on a first run); the inner
# join drops pairs whose DrugBank ID is absent from the mapping.
df_target = pd.merge(df_target, drugbank_mapping, how = 'inner')

In [15]:
# Preview the merged table
df_target.head(n = 3)

Unnamed: 0,DrugBank ID,Gene,Standard InChI Key
0,DB00114,HDC,NGVDGCNFYWLIFO-UHFFFAOYSA-N
1,DB00114,PYGL,NGVDGCNFYWLIFO-UHFFFAOYSA-N
2,DB00114,CSAD,NGVDGCNFYWLIFO-UHFFFAOYSA-N


In [16]:
# Parallel lists: genes[i] is the target gene of the drug whose InChI Key is drugs[i] #
genes, drugs = (df_target[column].tolist()
                for column in ('Gene', 'Standard InChI Key'))

### Creating drugsetlibrary and exporting

In [17]:
# A gene appears on one row per drug that targets it #
# Pair every gene with its drug's InChI Key, then gather all keys that share a gene under a single dictionary entry #

id_dict = tuple(zip(genes, drugs))

drugsetlibrary = defaultdict(list)
for gene, inchi_key in id_dict:
    drugsetlibrary[gene] += [inchi_key]

In [18]:
# Removing all duplicates and terms paired with fewer than 5 drugs
# Each drug set is emitted in sorted order. list(set(...)) yields an order that
# changes from one interpreter run to the next (string hashing is randomised),
# which made the exported library differ between otherwise identical runs.
# key = str keeps the sort well defined even if a non-string value (e.g. a
# missing key read as NaN) slipped through.
MIN_DRUGS_PER_TERM = 5
deduplicated = {term: set(keys) for term, keys in drugsetlibrary.items()}
drugsetlibrary = {term: sorted(keys, key = str)
                  for term, keys in deduplicated.items()
                  if len(keys) >= MIN_DRUGS_PER_TERM}

In [19]:
# Move to the export directory so the output file is written there.
# The path is relative, so running this cell a second time would resolve it from
# the new location instead of the notebook's folder; skip the move when the
# kernel is already inside data/Drugbank.
_export_suffix = os.sep + os.path.join('data', 'Drugbank')
if not os.getcwd().endswith(_export_suffix):
    os.chdir('../../../data/Drugbank')

In [20]:
# Write the drug-set library to a .gmt file in the current working directory
gmt_filename = 'Drugbank_approved_target_drugsetlibrary.gmt'
gmt_formatter(drugsetlibrary, gmt_filename)

### Library counts

In [21]:
# Summarise the final library with the project's library_counts helper; its printed report is the cell output
library_counts(drugsetlibrary)

1369 unique drugs
329 unique association terms
5935 unique associations
18.03951367781155 average drugs per term
