## Kinome Scan Drug-Set Library
### Drug-Set Labels : Kinases
#### Author : Daniel Clarke (daniel.clarke@mssm.edu) & Eryk Kropiwnicki (eryk.kropiwnicki@icahn.mssm.edu)
#### Kinome Scan data was processed and transformed into GMT file by Daniel Clarke

In [1]:
import csv
import os
import pandas as pd
import time 
import requests
import json
import numpy as np

In [2]:
os.chdir('../../scripts')
from export_script import *
from gene_resolver import *
os.chdir('../notebooks/KinomeScan')

### Harmonizing PubChemIDs with DrugBank IDs

In [3]:
# creates a matrix of drug associations
df = pd.DataFrame()
attributes = []
drugs = []

with open('input/kinomescan.T.pubchemid.gmt', 'r') as f:
    reader = csv.reader(f, delimiter = '\t')
    for line in reader:
        for g in line[2:]:
            attributes.append(line[0])
            drugs.append(int(g))

df['attributes'] = attributes
df['pubchem_id'] = drugs

In [4]:
df.head()

Unnamed: 0,attributes,pubchem_id
0,PCTK3,16747683
1,PCTK3,44139710
2,PCTK3,25222038
3,PCTK3,71576671
4,PCTK3,46191454


In [5]:
drugbank_mapping = pd.read_csv('../../metadata/mapping_files/pubchem.tsv', sep = '\t')

In [6]:
drugbank_mapping.head()

Unnamed: 0,drugbank_id,pubchem_id,inchi_key
0,DB00006,101041682,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,DB00006,126480209,OIRCOABEOLEUMC-GEJPAHFPSA-N
2,DB00006,132229728,OIRCOABEOLEUMC-GEJPAHFPSA-N
3,DB00006,134827539,OIRCOABEOLEUMC-GEJPAHFPSA-N
4,DB00006,137167482,OIRCOABEOLEUMC-GEJPAHFPSA-N


In [7]:
# Mapping pubchem IDs to DrugBank IDs
df = df.merge(drugbank_mapping)

In [8]:
df.head()

Unnamed: 0,attributes,pubchem_id,drugbank_id,inchi_key
0,PCTK3,46191454,DB12686,HUXYBQXJVXOMKX-UHFFFAOYSA-N
1,YANK1,46191454,DB12686,HUXYBQXJVXOMKX-UHFFFAOYSA-N
2,RSK4,46191454,DB12686,HUXYBQXJVXOMKX-UHFFFAOYSA-N
3,MARK2,46191454,DB12686,HUXYBQXJVXOMKX-UHFFFAOYSA-N
4,ICK,46191454,DB12686,HUXYBQXJVXOMKX-UHFFFAOYSA-N


### Validating all gene names

In [9]:
gene_resolver(df,columnName = 'attributes')

In [10]:
df.head()

Unnamed: 0,attributes,pubchem_id,drugbank_id,inchi_key,Approved Symbol
0,PCTK3,46191454,DB12686,HUXYBQXJVXOMKX-UHFFFAOYSA-N,CDK18
1,YANK1,46191454,DB12686,HUXYBQXJVXOMKX-UHFFFAOYSA-N,STK32A
2,RSK4,46191454,DB12686,HUXYBQXJVXOMKX-UHFFFAOYSA-N,RPS6KA6
3,MARK2,46191454,DB12686,HUXYBQXJVXOMKX-UHFFFAOYSA-N,MARK2
4,ICK,46191454,DB12686,HUXYBQXJVXOMKX-UHFFFAOYSA-N,CILK1


In [11]:
len(df)

3567

### Creating drugsetlibrary and exporting

In [12]:
# Creating list of gene names and drug IDs #
genes = df['Approved Symbol'].tolist()
drugs = df['inchi_key'].tolist()

In [13]:
# The input file contains duplicate protein ids matched to unique Drugbank accession numbers #
# Tupelizing protein ids and drugbank accession numbers and grouping all corresponding drugbank accession numbers
# under one common dictionary key 

id_dict = tuple(zip(genes, drugs))

drugsetlibrary = defaultdict(list)
for k, v in id_dict:
    drugsetlibrary[k].append(v)

In [14]:
# Employing a looser threshold of set-size of at least 3 because of the small library size
drugsetlibrary = {k:list(set(v)) for k,v in drugsetlibrary.items() if len(v) > 3} 

In [15]:
os.chdir('../../data/KinomeScan')

In [16]:
# Export drugsetlibrary in gmt format
gmt_formatter(drugsetlibrary, 'KinomeScan_kinase_drugsetlibrary.gmt')

### Library Counts

In [17]:
library_counts(drugsetlibrary)

54 unique drugs
301 unique association terms
2810 unique associations
9.335548172757475 average drugs per term
