## CREEDS Signatures Drug-Set Library
### Drug-set labels: Genes
#### ALL DATABASES ACCESSED 03/01/20
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import os
import csv
import json
import pandas as pd
from collections import defaultdict

In [2]:
# The project helper modules are imported by temporarily switching the working
# directory to the scripts folder (NOTE(review): this relies on the current
# directory being importable, as it is in a standard Jupyter kernel).
_notebook_dir = os.getcwd()  # remember where the notebook lives so it can be restored
os.chdir('../../scripts')
try:
    from export_script import *
    from gene_resolver import *
finally:
    # Always return to the notebook directory, even if an import fails, so the
    # relative data paths used by later cells keep resolving correctly.
    os.chdir(_notebook_dir)

### Import CREEDS data in JSON format
#### Source : https://amp.pharm.mssm.edu/CREEDS/#downloads

In [3]:
# Load the CREEDS single-drug perturbation signatures.
# The encoding is given explicitly because open() otherwise falls back to the
# platform locale, which is not guaranteed to be UTF-8.
with open('input/single_drug_perturbations-v1.0.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
# List the field names available on the first signature record
data[0].keys()

dict_keys(['smiles', 'cell_type', 'pert_ids', 'platform', 'drugbank_id', 'curator', 'geo_id', 'pubchem_cid', 'drug_name', 'version', 'ctrl_ids', 'down_genes', 'up_genes', 'organism', 'id'])

In [5]:
# Show the first signature record in full to see how each field is populated
data[0]

{'smiles': 'C1=C(C(=O)NC(=O)N1)F',
 'cell_type': 'Bone marrow Sca+ SP hematopoeitic stem cells (HSC) - 10 Day',
 'pert_ids': ['GSM26744', 'GSM26745'],
 'platform': 'GPL81',
 'drugbank_id': 'DB00544',
 'curator': 'cadimo',
 'geo_id': 'GSE1559',
 'pubchem_cid': 3385,
 'drug_name': 'Fluorouracil',
 'version': '1.0',
 'ctrl_ids': ['GSM26734', 'GSM26735'],
 'down_genes': [['Hmgn1', -0.17981423437595367],
  ['Gm10260', -0.14166636765003204],
  ['Npm1', -0.13669142127037048],
  ['Fosb', -0.1250605285167694],
  ['Gnb2l1', -0.12024638056755066],
  ['Jund', -0.10381214320659637],
  ['Ldha', -0.09215717762708664],
  ['Eif4a2', -0.09049281477928162],
  ['Igkv6-23', -0.08969537913799286],
  ['Ccnd2', -0.08510024100542068],
  ['Saraf', -0.08052777498960495],
  ['Cd164', -0.07728368788957596],
  ['AU020206', -0.07690908014774323],
  ['Gm6793', -0.0701846107840538],
  ['Ptma', -0.06753632426261902],
  ['Serpina3g', -0.06303614377975464],
  ['Ctnnb1', -0.06275594234466553],
  ['Hspa9', -0.0621070601046

### Generate up/down drug-set libraries

In [12]:
# Load the DrugBank ID / Standard InChI Key columns from the Drugmonizome
# metadata table, renaming the ID column to match the CREEDS 'drugbank_id' field
drugbank_mapping = (
    pd.read_csv(
        '../../metadata/drugmonizome_metadata.tsv',
        sep = '\t',
        usecols = ['DrugBank ID', 'Standard InChI Key'],
    )
    .rename(columns = {'DrugBank ID': 'drugbank_id'})
)

In [13]:
# Preview the first rows of the DrugBank ID -> InChI Key mapping
drugbank_mapping.head()

Unnamed: 0,drugbank_id,Standard InChI Key
0,DB00006,OIRCOABEOLEUMC-GEJPAHFPSA-N
1,DB00007,GFIJNRVAKGFPGQ-LIJARHBVSA-N
2,DB00014,BLCLNMBMMGCOAS-URPVMXJPSA-N
3,DB00027,NDAYQJDHGXTBJL-MWWSRJDJSA-N
4,DB00035,NFLWUMRGJYTJIN-PNIOQBSNSA-N


In [14]:
# Build a drug-set library from the CREEDS JSON dump for a chosen organism and gene list (up_genes or down_genes)

def libraryGenerator(data, organism = 'human', gene_specification = 'down_genes', min_set_size = 5):
    """Build a gene -> drug-set library from CREEDS perturbation signatures.

    Parameters
    ----------
    data : list of dict
        Records from the CREEDS JSON dump. Each record is read for its
        'organism', 'drugbank_id' and `gene_specification` fields; the gene
        field is a list of [gene_symbol, score] pairs.
    organism : str
        Only records whose 'organism' equals this value are used.
    gene_specification : str
        Which gene list to read from each record: 'up_genes' or 'down_genes'.
    min_set_size : int
        Minimum number of distinct drugs a gene must be associated with to be
        kept in the library (default 5).

    Returns
    -------
    dict
        Maps each approved gene symbol to a sorted list of the distinct
        Standard InChI Keys of the drugs associated with it.

    Notes
    -----
    Relies on the notebook-level `drugbank_mapping` DataFrame (columns
    'drugbank_id' and 'Standard InChI Key') and on `gene_resolver`, which is
    expected to add an 'Approved Symbol' column to the frame in place (its
    return value is not used here).
    """
    edgelist = []

    for entry in data:
        if entry['organism'] != organism: # specify organism type
            continue
        drug = entry['drugbank_id']
        if not drug or pd.isna(drug):
            # Skip signatures without a DrugBank ID: pandas merge matches null
            # keys against each other, so they could otherwise be joined to
            # metadata rows that also lack an ID.
            continue
        # one (drug, gene) edge per gene in the selected list; the score is dropped
        edgelist.extend((drug, gene[0]) for gene in entry[gene_specification])

    df = pd.DataFrame(data = edgelist, columns = ['drugbank_id','gene']) # create df of drug-gene edge list

    gene_resolver(df, columnName = 'gene') # resolve CREEDS gene names to approved symbols

    # Map DrugBank IDs to InChI Keys, joining explicitly on the shared ID column
    df = df.merge(drugbank_mapping, on = 'drugbank_id')

    # Drop edges where either identifier is missing so that no NaN term or
    # NaN drug ends up in the library
    df = df.dropna(subset = ['Approved Symbol', 'Standard InChI Key'])

    drugsets = defaultdict(set)
    for symbol, inchi_key in zip(df['Approved Symbol'], df['Standard InChI Key']):
        drugsets[symbol].add(inchi_key)

    # Sort members so the exported library is identical from run to run,
    # and remove sets smaller than min_set_size
    drugsetlibrary = {symbol: sorted(keys)
                      for symbol, keys in drugsets.items()
                      if len(keys) >= min_set_size}

    return drugsetlibrary

#### Downregulated genes

In [15]:
# Build the library from the down-regulated gene lists of human signatures
drugsetlibrary_down = libraryGenerator(data,organism = 'human',gene_specification = 'down_genes')

In [16]:
# Print summary statistics for the down-regulated library (drugs, terms, associations)
library_counts(drugsetlibrary_down)

72 unique drugs
2532 unique association terms
29782 unique associations
11.762243285939968 average drugs per term


In [17]:
# Export the down-regulated library; gmt_formatter is defined outside this notebook
# and presumably writes a GMT file to the given path (TODO confirm)
gmt_formatter(drugsetlibrary_down, '../../data/CREEDS/CREEDS_human_drugsetlibrary_down.gmt')

#### Upregulated genes

In [18]:
# Build the library from the up-regulated gene lists of human signatures
drugsetlibrary_up = libraryGenerator(data, organism = 'human', gene_specification = 'up_genes')

In [19]:
# Print summary statistics for the up-regulated library (drugs, terms, associations)
library_counts(drugsetlibrary_up)

71 unique drugs
2535 unique association terms
29603 unique associations
11.677712031558185 average drugs per term


In [20]:
# Export the up-regulated library; gmt_formatter is defined outside this notebook
# and presumably writes a GMT file to the given path (TODO confirm)
gmt_formatter(drugsetlibrary_up, '../../data/CREEDS/CREEDS_human_drugsetlibrary_up.gmt')