## L1000FWD GO Terms Drug-Set Libraries
#### Drug-Set Labels : GO Terms
#### ALL DATABASES ACCESSED 11/2019
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import json
import pandas as pd
import requests
import time
from collections import defaultdict
import csv
import os

In [2]:
# Import the shared helpers from the repository's scripts/ directory.
# The import is performed from inside that directory, and the previous working
# directory is restored afterwards even if the import fails, so the relative
# paths used later ('input/...') keep resolving and the cell can be re-run.
# NOTE(review): assumes the notebook is started from notebooks/L1000FWD.
_notebook_dir = os.getcwd()
os.chdir('../../scripts')
try:
    # presumably provides gmt_formatter and library_counts used below — confirm
    from export_script import *
finally:
    os.chdir(_notebook_dir)

### Importing pert_ids matched to EnrichrIDs, and matching each pert_id to its InChI Key
#### Input Files : pert_id_inchi.tsv | EnrichrIDs_downregulated.tsv | EnrichrIDs_upregulated.tsv

In [3]:
# Load the tab-separated lookup table that maps each pert_id to an InChI Key.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, so the
# file appears to contain a saved row index — confirm it is not needed.
inchi_lookup = pd.read_csv('input/pert_id_inchi.tsv', sep = '\t')

In [4]:
# Preview the first rows of the lookup table (columns: Unnamed: 0, pert_id, inchi_key)
inchi_lookup.head()

Unnamed: 0,pert_id,inchi_key
0,BRD-A00546892,YSXKPIUOCJLQIE-UHFFFAOYSA-N
1,BRD-A00993607,PAZJSJFMUHDSTF-UHFFFAOYSA-N
2,BRD-A01320529,GIIZNNXWQWCKIB-UHFFFAOYSA-N
3,BRD-A01787639,HRRBJVNMSRJFHQ-UHFFFAOYSA-N
4,BRD-A01826957,DSFGXPJYDCSWTA-UHFFFAOYSA-N


In [5]:
# Up-regulated signatures: table of EnrichrIDs to be annotated with InChI Keys
df_up = pd.read_csv('input/EnrichrIDs_upregulated.tsv', sep='\t')
# Join explicitly on pert_id. Without `on`, merge() uses every column the two
# frames share, and the lookup table also carries a stray 'Unnamed: 0' column.
df_up = inchi_lookup.merge(df_up, on='pert_id')
# Map enrichr_id -> inchi_key; only that one column is converted to a dict
id_dict_up = df_up.set_index('enrichr_id')['inchi_key'].to_dict()

In [6]:
# Down-regulated signatures: table of EnrichrIDs to be annotated with InChI Keys
df_down = pd.read_csv('input/EnrichrIDs_downregulated.tsv', sep='\t')
# Join explicitly on pert_id. Without `on`, merge() uses every column the two
# frames share, and the lookup table also carries a stray 'Unnamed: 0' column.
df_down = inchi_lookup.merge(df_down, on='pert_id')
# Map enrichr_id -> inchi_key; only that one column is converted to a dict
id_dict_down = df_down.set_index('enrichr_id')['inchi_key'].to_dict()

In [7]:
def enrichr_library_generator(gene_set_library, dictionary, p_value_cutoff=0.01,
                              min_set_size=5, request_timeout=60):
    '''
    Builds a drug-set library by querying Enrichr once per EnrichrID and grouping
    the associated drugs under every term that passes the significance cut-off.

    Parameters:
    gene_set_library (str): Specify the Enrichr library to pull enriched terms from
    dictionary (dict): Dictionary of EnrichrIDs (keys) mapped to the term (e.g. drug)
        associated with that EnrichrID (and in effect, a genelist/signature)
    p_value_cutoff (float): A term is kept for a drug only when the value at index 6
        of its Enrichr result row is strictly below this threshold
    min_set_size (int): Terms paired with fewer unique drugs than this are removed
    request_timeout (float): Seconds to wait for each Enrichr response

    Returns:
    drugsetlibrary (dict): dictionary of Enrichr terms matched to lists of unique
        drugs significantly associated with the term

    Raises:
    requests.exceptions.RequestException: if a request fails or times out
    '''

    #################################################################
    ### Querying up/downregulated drug/gene lists through Enrichr ###
    #################################################################

    enrichr_url = 'http://amp.pharm.mssm.edu/Enrichr/enrich'
    query_string = '?userListId=%s&backgroundType=%s'

    # term -> set of drugs; a set de-duplicates drugs that have several signatures
    drugs_by_term = defaultdict(set)

    for user_id, drug in dictionary.items():
        response = requests.get(
            enrichr_url + query_string % (user_id, gene_set_library),
            timeout=request_timeout,  # never hang forever on a stalled connection
        )
        # Pause after every request (including failed ones) to go easy on the server
        time.sleep(0.5)

        # Responses that are not valid JSON are skipped, as before
        try:
            data = response.json()
        except ValueError:
            continue

        # Only read the requested library's rows; payloads without that key
        # (e.g. an error object) are skipped instead of raising KeyError
        if not isinstance(data, dict):
            continue
        rows = data.get(gene_set_library) or ()

        for row in rows:
            # row[1] is the term name; row[6] is the adjusted p-value according
            # to the Enrichr API documentation
            if row[6] < p_value_cutoff:
                drugs_by_term[row[1]].add(drug)

    ####################################
    ### Creating Drug-set library ###
    ####################################

    # Removing all terms paired with fewer than `min_set_size` unique drugs #
    drugsetlibrary = {
        term: list(drugs)
        for term, drugs in drugs_by_term.items()
        if len(drugs) >= min_set_size
    }
    return drugsetlibrary

#### For all drug-term associations I use a cut-off of 0.01 on the adjusted p-value returned by Enrichr (the default `p_value_cutoff`)

### Biological Processes drug-set libraries

In [8]:
# Switch to the L1000FWD data directory; presumably so that the files exported
# below are written there — confirm against gmt_formatter
os.chdir('../../data/L1000FWD')

In [9]:
# Upregulated: GO Biological Process terms for the up-regulated signatures,
# using the function's default cut-off of 0.01
GO_BP_drugsetlibrary_up = enrichr_library_generator('GO_Biological_Process_2018', id_dict_up)

In [10]:
# Downregulated: GO Biological Process terms for the down-regulated signatures,
# using the function's default cut-off of 0.01
GO_BP_drugsetlibrary_down = enrichr_library_generator('GO_Biological_Process_2018', id_dict_down)

In [11]:
# Exporting both Biological Process libraries, up-regulated first
# (gmt_formatter is not defined in this notebook; presumably from export_script)
for bp_library, bp_filename in (
    (GO_BP_drugsetlibrary_up, 'L1000FWD_GO_Biological_Processes_drugsetlibrary_up.txt'),
    (GO_BP_drugsetlibrary_down, 'L1000FWD_GO_Biological_Processes_drugsetlibrary_down.txt'),
):
    gmt_formatter(bp_library, bp_filename)

### Biological Processes Library Counts

In [12]:
# Summary of the up-regulated Biological Process library: unique drugs, terms,
# associations and average drugs per term
library_counts(GO_BP_drugsetlibrary_up)

4195 unique drugs
1228 unique association terms
71261 unique associations
58.03013029315961 average drugs per term


In [13]:
# Summary of the down-regulated Biological Process library: unique drugs, terms,
# associations and average drugs per term
library_counts(GO_BP_drugsetlibrary_down)

4013 unique drugs
1068 unique association terms
54525 unique associations
51.05337078651685 average drugs per term


### Cellular Component drug-set libraries

In [14]:
# Both Cellular Component libraries are built from the same Enrichr library
cc_library_name = 'GO_Cellular_Component_2018'

# Up-regulated signatures first, then down-regulated ones
GO_CC_drugsetlibrary_up = enrichr_library_generator(cc_library_name, id_dict_up)
GO_CC_drugsetlibrary_down = enrichr_library_generator(cc_library_name, id_dict_down)

In [15]:
# Exporting both Cellular Component libraries, up-regulated first
# (gmt_formatter is not defined in this notebook; presumably from export_script)
for cc_library, cc_filename in (
    (GO_CC_drugsetlibrary_up, 'L1000FWD_GO_Cellular_Component_drugsetlibrary_up.txt'),
    (GO_CC_drugsetlibrary_down, 'L1000FWD_GO_Cellular_Component_drugsetlibrary_down.txt'),
):
    gmt_formatter(cc_library, cc_filename)

### Cellular Component Library Counts

In [16]:
# Summary of the up-regulated Cellular Component library: unique drugs, terms,
# associations and average drugs per term
library_counts(GO_CC_drugsetlibrary_up)

3366 unique drugs
153 unique association terms
15587 unique associations
101.87581699346406 average drugs per term


In [17]:
# Summary of the down-regulated Cellular Component library: unique drugs, terms,
# associations and average drugs per term
library_counts(GO_CC_drugsetlibrary_down)

3246 unique drugs
157 unique association terms
15830 unique associations
100.828025477707 average drugs per term


### Molecular Function drug-set libraries

In [19]:
# Both Molecular Function libraries are built from the same Enrichr library
mf_library_name = 'GO_Molecular_Function_2018'

# Up-regulated signatures first, then down-regulated ones
GO_MF_drugsetlibrary_up = enrichr_library_generator(mf_library_name, id_dict_up)
GO_MF_drugsetlibrary_down = enrichr_library_generator(mf_library_name, id_dict_down)

In [20]:
# Exporting both Molecular Function libraries, up-regulated first
# (gmt_formatter is not defined in this notebook; presumably from export_script)
for mf_library, mf_filename in (
    (GO_MF_drugsetlibrary_up, 'L1000FWD_GO_Molecular_Function_drugsetlibrary_up.txt'),
    (GO_MF_drugsetlibrary_down, 'L1000FWD_GO_Molecular_Function_drugsetlibrary_down.txt'),
):
    gmt_formatter(mf_library, mf_filename)

### Molecular Function Library Counts

In [21]:
# Summary of the up-regulated Molecular Function library: unique drugs, terms,
# associations and average drugs per term
library_counts(GO_MF_drugsetlibrary_up)

2427 unique drugs
183 unique association terms
10389 unique associations
56.77049180327869 average drugs per term


In [22]:
# Summary of the down-regulated Molecular Function library: unique drugs, terms,
# associations and average drugs per term
library_counts(GO_MF_drugsetlibrary_down)

2158 unique drugs
158 unique association terms
7674 unique associations
48.56962025316456 average drugs per term
