## L1000FWD KEGG Pathways Drug-Set Libraries
#### Drug-Set Labels : KEGG Pathways
#### ALL DATABASES ACCESSED 11/2019
##### Author : Eryk Kropiwnicki | eryk.kropiwnicki@icahn.mssm.edu

In [1]:
import json
import pandas as pd
import requests
import time
from collections import defaultdict
import csv
import os

In [2]:
# The export helpers live in ../../scripts, which is not a package on the
# import path, so the working directory is switched there for the import.
os.chdir('../../scripts')
try:
    from export_script import *
finally:
    # Always return to the notebook directory, even when the import fails;
    # otherwise the kernel stays in scripts/ and every later relative path
    # (and a re-run of this cell) breaks.
    os.chdir('../notebooks/L1000FWD')

### Importing Drugs matched to EnrichrIDs for querying through Enrichr API
#### Input Files : pert_id_inchi.tsv | EnrichrIDs_upregulated.tsv | EnrichrIDs_downregulated.tsv

In [3]:
# Tab-separated lookup table pairing each pert_id with its InChIKey
inchi_lookup = pd.read_csv('input/pert_id_inchi.tsv', delimiter = '\t')

In [4]:
# Preview the first rows of the pert_id / inchi_key lookup table
inchi_lookup.head()

Unnamed: 0,pert_id,inchi_key
0,BRD-A00546892,YSXKPIUOCJLQIE-UHFFFAOYSA-N
1,BRD-A00993607,PAZJSJFMUHDSTF-UHFFFAOYSA-N
2,BRD-A01320529,GIIZNNXWQWCKIB-UHFFFAOYSA-N
3,BRD-A01787639,HRRBJVNMSRJFHQ-UHFFFAOYSA-N
4,BRD-A01826957,DSFGXPJYDCSWTA-UHFFFAOYSA-N


In [5]:
# Load the EnrichrIDs of the upregulated signatures and attach an InChIKey to
# each one. merge() is called without `on=`, so the join uses every column
# name the two tables have in common.
df_up = pd.merge(
    inchi_lookup,
    pd.read_csv('input/EnrichrIDs_upregulated.tsv', sep = '\t'),
)
# enrichr_id -> inchi_key mapping that is later passed to enrichr_library_generator
id_dict_up = df_up.set_index('enrichr_id')['inchi_key'].to_dict()
df_up.head()

Unnamed: 0,pert_id,inchi_key,enrichr_id
0,BRD-A00546892,YSXKPIUOCJLQIE-UHFFFAOYSA-N,26921285
1,BRD-A00993607,PAZJSJFMUHDSTF-UHFFFAOYSA-N,26920584
2,BRD-A01320529,GIIZNNXWQWCKIB-UHFFFAOYSA-N,26921085
3,BRD-A01787639,HRRBJVNMSRJFHQ-UHFFFAOYSA-N,26918480
4,BRD-A01826957,DSFGXPJYDCSWTA-UHFFFAOYSA-N,26917074


In [6]:
# Load the EnrichrIDs of the downregulated signatures and attach an InChIKey
# to each one. merge() is called without `on=`, so the join uses every column
# name the two tables have in common.
df_down = pd.merge(
    inchi_lookup,
    pd.read_csv('input/EnrichrIDs_downregulated.tsv', sep = '\t'),
)
# enrichr_id -> inchi_key mapping that is later passed to enrichr_library_generator
id_dict_down = df_down.set_index('enrichr_id')['inchi_key'].to_dict()
df_down.head()

Unnamed: 0,pert_id,inchi_key,enrichr_id
0,BRD-A00546892,YSXKPIUOCJLQIE-UHFFFAOYSA-N,26913721
1,BRD-A00993607,PAZJSJFMUHDSTF-UHFFFAOYSA-N,26912264
2,BRD-A01320529,GIIZNNXWQWCKIB-UHFFFAOYSA-N,26913302
3,BRD-A01787639,HRRBJVNMSRJFHQ-UHFFFAOYSA-N,26908404
4,BRD-A01826957,DSFGXPJYDCSWTA-UHFFFAOYSA-N,26906828


In [7]:
def enrichr_library_generator(gene_set_library, dictionary, p_value_cutoff = 0.01, min_drugs = 5):
    '''
    Queries every EnrichrID through the Enrichr API and builds a drug-set library
    from the significantly enriched terms of the specified gene-set library

    Parameters:
    gene_set_library (str): Name of the Enrichr library to pull enriched terms from
    dictionary (dict): Maps an EnrichrID (and in effect, a genelist/signature) to the term
        (e.g. drug) that should be recorded for it
    p_value_cutoff (float): A term is kept for a drug only when the corrected p-value of its
        result row (index 6) is strictly below this value
    min_drugs (int): Terms associated with fewer than this many unique drugs are dropped

    Returns:
    drugsetlibrary (dict): Enrichr terms matched to lists of unique drugs significantly
        associated with the term, each list in order of first appearance

    Raises:
    requests.exceptions.RequestException: if a request cannot be completed or times out

    Notes:
    EnrichrIDs whose response is not JSON, or whose JSON does not contain results for
    gene_set_library, are skipped; a single warning listing how many were skipped is
    issued at the end so partial libraries do not go unnoticed.
    '''

    #################################################################
    ### Querying up/downregulated drug/gene lists through Enrichr ###
    #################################################################

    enrichr_url = 'http://amp.pharm.mssm.edu/Enrichr/enrich'
    query_string = '?userListId=%s&backgroundType=%s'

    # term -> {drug: None}; dict keys de-duplicate drugs while keeping a
    # reproducible (first-seen) order, unlike list(set(...))
    term_to_drugs = defaultdict(dict)
    skipped_ids = []

    for user_id, drug in dictionary.items():
        # A timeout keeps one stalled connection from hanging the whole run
        response = requests.get(enrichr_url + query_string % (user_id, gene_set_library), timeout = 60)
        # Pause after every request, successful or not, to avoid flooding the server
        time.sleep(0.5)

        try:
            data = response.json()
        except ValueError:
            # Body was not JSON: nothing to extract for this ID
            skipped_ids.append(user_id)
            continue

        # Only the rows stored under the requested library are results; any other
        # top-level key in the response must not be read as enrichment rows
        results = data.get(gene_set_library) if isinstance(data, dict) else None
        if results is None:
            skipped_ids.append(user_id)
            continue

        for row in results:
            # row[1] is the term name, row[6] the corrected p-value
            if row[6] < p_value_cutoff:
                term_to_drugs[row[1]][drug] = None

    if skipped_ids:
        import warnings
        warnings.warn('%d of %d EnrichrIDs returned no usable results for %s and were skipped'
                      % (len(skipped_ids), len(dictionary), gene_set_library))

    ####################################
    ### Creating Drug-set library ###
    ####################################

    # Removing all terms paired with fewer than min_drugs unique drugs #
    drugsetlibrary = {term : list(drugs) for term, drugs in term_to_drugs.items() if len(drugs) >= min_drugs}
    return drugsetlibrary

### KEGG Pathways drug-set libraries 
#### For all drug-term associations I use a corrected Enrichr p-value cut-off of 0.01

In [8]:
# Switch the working directory to the L1000FWD data folder; the export cell
# below passes bare file names, so presumably the output is written here
# (depends on gmt_formatter - TODO confirm).
os.chdir('../../data/L1000FWD')

In [9]:
# Drug sets derived from the upregulated signatures #
KEGG_drugsetlibrary_up = enrichr_library_generator(
    gene_set_library = 'KEGG_2019_Human',
    dictionary = id_dict_up,
)

In [10]:
# Drug sets derived from the downregulated signatures #
KEGG_drugsetlibrary_down = enrichr_library_generator(
    gene_set_library = 'KEGG_2019_Human',
    dictionary = id_dict_down,
)

### Library counts

In [11]:
# upregulated
# Summary statistics for the library (unique drugs, unique terms, unique
# associations, average drugs per term). library_counts is not defined in
# this notebook - presumably provided by export_script.
library_counts(KEGG_drugsetlibrary_up)

3662 unique drugs
245 unique association terms
29543 unique associations
120.58367346938776 average drugs per term


In [12]:
# downregulated
# Summary statistics for the library (unique drugs, unique terms, unique
# associations, average drugs per term). library_counts is not defined in
# this notebook - presumably provided by export_script.
library_counts(KEGG_drugsetlibrary_down)

3309 unique drugs
236 unique association terms
20602 unique associations
87.29661016949153 average drugs per term


#### Exporting

In [13]:
# Export both libraries, one file per direction. gmt_formatter is not defined
# in this notebook - presumably provided by export_script and writing GMT-style
# text relative to the current working directory (TODO confirm).
gmt_formatter(KEGG_drugsetlibrary_up, 'L1000FWD_KEGG_Pathways_drugsetlibrary_up.txt')
gmt_formatter(KEGG_drugsetlibrary_down, 'L1000FWD_KEGG_Pathways_drugsetlibrary_down.txt')