## Associating significant L1000FWD drug signatures with respective drugs
### Database : http://amp.pharm.mssm.edu/L1000FWD/

In [1]:
import pandas as pd
import os
import csv
from itertools import islice
from collections import defaultdict
import json

In [2]:
# Temporarily switch into the shared scripts directory so that its modules can
# be imported, then switch back to this notebook's directory.
# Helpers called further down that are not defined in this notebook
# (gmt_formatter, transposer, gene_resolver) presumably come from these two
# wildcard imports -- TODO confirm and import them by name.
os.chdir('../../scripts')
try:
    from export_script import *
    from gene_resolver import *
finally:
    # Restore the working directory even when an import fails, so a failed run
    # does not leave the kernel stranded in the scripts directory.
    os.chdir('../notebooks/L1000FWD')

In [3]:
# Load the signature metadata table and lower-case the free-text perturbagen
# description column in the same expression.
df_sig_metadata = pd.read_csv('input/CD_signature_metadata.csv').assign(
    pert_desc=lambda frame: frame['pert_desc'].str.lower()
)

In [4]:
df_sig_metadata.head()

Unnamed: 0,sig_id,SCS_centered_by_batch,batch,cell_id,mean_cosine_dist_centered_by_batch,pert_desc,pert_dose,pert_id,pert_time
0,AML001_CD34_6H:BRD-K43389675:10,0.0,AML001_CD34_6H,CD34,0.853471,daunorubicin,10.0,BRD-K43389675,6.0
1,AML001_PC3_6H:BRD-A19037878:0.37037,0.0,AML001_PC3_6H,PC3,0.505995,trichostatin_a,0.37037,BRD-A19037878,6.0
2,AML001_PC3_6H:BRD-A19037878:1.11111,0.0,AML001_PC3_6H,PC3,0.676204,trichostatin_a,1.11111,BRD-A19037878,6.0
3,AML001_PC3_6H:BRD-A19037878:10,0.0,AML001_PC3_6H,PC3,0.747633,trichostatin_a,10.0,BRD-A19037878,6.0
4,AML001_PC3_6H:BRD-A19037878:3.33333,0.0,AML001_PC3_6H,PC3,0.659851,trichostatin_a,3.33333,BRD-A19037878,6.0


In [5]:
# Creating separate files for up and down genes
# The combined GMT is split by line parity: lines at even 0-based positions go
# to the "up" file and lines at odd positions go to the "down" file.
combined_gmt = 'input/CD_signatures_binary_42809.gmt'
split_targets = (
    (0, 'input/L1000FWD_drug_up_genes.gmt'),
    (1, 'input/L1000FWD_drug_down_genes.gmt'),
)
for first_line, target_path in split_targets:
    # The with-statement closes both handles on exit, so no explicit close()
    # call is needed afterwards.
    with open(combined_gmt, 'r') as fin, open(target_path, 'w') as fout:
        fout.writelines(islice(fin, first_line, None, 2))

### Filtering df_sig_metadata by lowest mean_cosine_dist_centered_by_batch

In [6]:
# Get the signature with the smallest mean cosine distance for each pert id.
# The distance column contains -666 entries, which appear to be a
# missing-value placeholder rather than real distances (the same value shows up
# as a placeholder in pert_desc). Sorting on the raw column would rank those
# rows first, so they are masked to NaN in a temporary sort key; NaN sorts last
# by default, which means a placeholder row is only picked when a pert id has
# no real distance at all. The original column values are left untouched.
distance_column = 'mean_cosine_dist_centered_by_batch'
missing_placeholder = -666
df_filtered = (
    df_sig_metadata
    .assign(_sort_distance=lambda frame: frame[distance_column].where(
        frame[distance_column] != missing_placeholder))
    .sort_values(by=['pert_id', '_sort_distance'])
    .groupby('pert_id')
    .head(1)
    .drop(columns='_sort_distance')
)

In [7]:
len(df_filtered)

4941

In [8]:
df_filtered.head()

Unnamed: 0,sig_id,SCS_centered_by_batch,batch,cell_id,mean_cosine_dist_centered_by_batch,pert_desc,pert_dose,pert_id,pert_time
26739,CPC015_PHH_24H:BRD-A00100033-001-04-8:10,0.0203,CPC015_PHH_24H,PHH,0.911062,nifurtimox,10.0,BRD-A00100033,24.0
336,CPC001_HA1E_24H:BRD-A00267231-001-01-1:10,0.0022,CPC001_HA1E_24H,HA1E,0.897247,hemado,10.0,BRD-A00267231,24.0
31189,CPC019_MCF7_24H:BRD-A00420644-001-01-7:10,0.0,CPC019_MCF7_24H,MCF7,0.728397,-666,10.0,BRD-A00420644,24.0
16182,CPC010_HA1E_6H:BRD-A00474148-001-02-3:10,0.0007,CPC010_HA1E_6H,HA1E,0.913214,-666,10.0,BRD-A00474148,6.0
17487,CPC011_HA1E_6H:BRD-A00520476-001-05-8:10,0.0147,CPC011_HA1E_6H,HA1E,0.890211,"6h-pyrido[2,3-b][1,4]benzodiazepin-6-one, 11-[...",10.0,BRD-A00520476,6.0


### Importing L1000FWD drug metadata to match pert_ids

In [13]:
# Load only the identifier columns used downstream, discard compounds without a
# PubChem CID, and normalise the column types in one chained expression.
drug_columns = ['pert_iname', 'pert_id', 'inchi_key', 'pubchem_cid', 'LSM_id']
df_metadata = (
    pd.read_csv('input/Drugs_metadata.csv', usecols=drug_columns)
    .dropna(subset=['pubchem_cid'])
    .astype({'pubchem_cid': int, 'inchi_key': str})
)
# Drop the first nine characters of every key (presumably an "InChIKey="
# prefix -- verify against the raw file).
df_metadata['inchi_key'] = df_metadata['inchi_key'].str[9:]

In [14]:
df_metadata.head()

Unnamed: 0,pert_id,LSM_id,pert_iname,inchi_key,pubchem_cid
0,BRD-A00100033,LSM-1232,nifurtimox,ARFHIAQFJWUCFH-UHFFFAOYSA-N,6842999
1,BRD-A00520476,LSM-22625,otenzepad,UBRKDAVQCKZSPO-UHFFFAOYSA-N,107867
2,BRD-A00546892,LSM-1235,biperiden,YSXKPIUOCJLQIE-UHFFFAOYSA-N,92151
3,BRD-A00758722,LSM-1237,noretynodrel,ICTXHFFSOAJUMG-OQPPHWFISA-N,5702095
4,BRD-A00827783,LSM-4299,dyphylline,KSCFJBIXMNOVSH-UHFFFAOYSA-N,3182


In [15]:
# Attach the drug identifiers to each selected signature. The join key is named
# explicitly so the result does not silently change if either table later gains
# another column with a shared name; pert_id is the only column the two tables
# have in common here.
df_main = df_filtered.merge(df_metadata, on='pert_id', how='inner')

In [16]:
df_main.head()

Unnamed: 0,sig_id,SCS_centered_by_batch,batch,cell_id,mean_cosine_dist_centered_by_batch,pert_desc,pert_dose,pert_id,pert_time,LSM_id,pert_iname,inchi_key,pubchem_cid
0,CPC015_PHH_24H:BRD-A00100033-001-04-8:10,0.0203,CPC015_PHH_24H,PHH,0.911062,nifurtimox,10.0,BRD-A00100033,24.0,LSM-1232,nifurtimox,ARFHIAQFJWUCFH-UHFFFAOYSA-N,6842999
1,CPC001_HA1E_24H:BRD-A00267231-001-01-1:10,0.0022,CPC001_HA1E_24H,HA1E,0.897247,hemado,10.0,BRD-A00267231,24.0,LSM-1233,hemado,KOCIMZNSNPOGOP-UHFFFAOYSA-N,4043357
2,CPC019_MCF7_24H:BRD-A00420644-001-01-7:10,0.0,CPC019_MCF7_24H,MCF7,0.728397,-666,10.0,BRD-A00420644,24.0,LSM-6366,SA-3676,ASCBUEVCEVGOFP-UHFFFAOYSA-N,2853908
3,CPC010_HA1E_6H:BRD-A00474148-001-02-3:10,0.0007,CPC010_HA1E_6H,HA1E,0.913214,-666,10.0,BRD-A00474148,6.0,LSM-1234,BRD-A00474148,RCGAUPRLRFZAMS-UHFFFAOYSA-N,44825297
4,CPC011_HA1E_6H:BRD-A00520476-001-05-8:10,0.0147,CPC011_HA1E_6H,HA1E,0.890211,"6h-pyrido[2,3-b][1,4]benzodiazepin-6-one, 11-[...",10.0,BRD-A00520476,6.0,LSM-22625,otenzepad,UBRKDAVQCKZSPO-UHFFFAOYSA-N,107867


### Importing Drugbank mapping file

In [22]:
# Read the DrugBank mapping table and relabel two of its columns: the LINCS id
# becomes LSM_id and the InChI key becomes inchi_drugbank.
drugbank_column_names = {'inchi_key': 'inchi_drugbank', 'lincs_id': 'LSM_id'}
drugbank_mapping = pd.read_csv(
    '../../metadata/mapping_files/lincs.tsv', sep='\t'
).rename(columns=drugbank_column_names)

In [23]:
drugbank_mapping.head()

Unnamed: 0,drugbank_id,LSM_id,inchi_drugbank
0,DB00014,LSM-46023,BLCLNMBMMGCOAS-URPVMXJPSA-N
1,DB00014,LSM-5104,BLCLNMBMMGCOAS-URPVMXJPSA-N
2,DB00035,LSM-46024,NFLWUMRGJYTJIN-PNIOQBSNSA-N
3,DB00091,LSM-1703,PMATZTZNYRCHOR-CGLBZJNRSA-N
4,DB00091,LSM-2280,PMATZTZNYRCHOR-CGLBZJNRSA-N


In [24]:
# Keep the signatures whose LSM_id has a DrugBank entry. The join key is named
# explicitly so the match does not depend on which column names the two tables
# happen to share; LSM_id is the only shared column here. One LSM_id can map to
# several DrugBank rows, so the result can have more rows than distinct LSM ids.
df_drugbank = df_main.merge(drugbank_mapping, on='LSM_id', how='inner')

958 small molecules were found in DrugBank; however, the remaining L1000 drugs also need to be included in the analysis.

In [25]:
len(df_drugbank)

1433

In [26]:
df_drugbank.head()

Unnamed: 0,sig_id,SCS_centered_by_batch,batch,cell_id,mean_cosine_dist_centered_by_batch,pert_desc,pert_dose,pert_id,pert_time,LSM_id,pert_iname,inchi_key,pubchem_cid,drugbank_id,inchi_drugbank
0,CPC015_PHH_24H:BRD-A00100033-001-04-8:10,0.0203,CPC015_PHH_24H,PHH,0.911062,nifurtimox,10.0,BRD-A00100033,24.0,LSM-1232,nifurtimox,ARFHIAQFJWUCFH-UHFFFAOYSA-N,6842999,DB11820,ARFHIAQFJWUCFH-IZZDOVSWSA-N
1,CPC015_A375_6H:BRD-A00546892-001-01-8:10,0.0413,CPC015_A375_6H,A375,0.948594,-666,10.0,BRD-A00546892,6.0,LSM-1235,biperiden,YSXKPIUOCJLQIE-UHFFFAOYSA-N,92151,DB00810,YSXKPIUOCJLQIE-UHFFFAOYSA-N
2,CPC002_VCAP_24H:BRD-A00758722-001-03-1:10,0.0,CPC002_VCAP_24H,VCAP,0.712465,norethynodrel,10.0,BRD-A00758722,24.0,LSM-1237,noretynodrel,ICTXHFFSOAJUMG-OQPPHWFISA-N,5702095,DB09371,ICTXHFFSOAJUMG-SLHNCBLASA-N
3,CPC016_MCF7_24H:BRD-A00993607-003-15-4:10,0.0021,CPC016_MCF7_24H,MCF7,0.921066,-666,10.0,BRD-A00993607,24.0,LSM-1238,alprenolol,PAZJSJFMUHDSTF-UHFFFAOYSA-N,66368,DB00866,PAZJSJFMUHDSTF-UHFFFAOYSA-N
4,CPC006_THP1_6H:BRD-A01145011-001-01-4:11.1,0.0,CPC006_THP1_6H,THP1,-666.0,py 7715,11.1,BRD-A01145011,6.0,LSM-6181,zebularine,RPQZTTQVRYEKCR-JJFBUQMESA-N,46783268,DB03068,RPQZTTQVRYEKCR-WCTZXXKLSA-N


In [27]:
len(set(df_drugbank['LSM_id']))

1245

In [28]:
len(set(df_drugbank['drugbank_id']))

1176

Creating an L1000FWD-specific metadata table for small molecules not found in DrugBank

In [30]:
# Keep only the compounds whose InChI key has no match among the DrugBank hits.
# A boolean mask plus .copy() is used instead of dropping rows from an alias of
# df_main: the alias made every drop mutate df_main as well, so re-running
# earlier cells after this one gave different results. The mask also avoids
# rebuilding the lookup list once per row.
matched_in_drugbank = df_main['inchi_key'].isin(df_drugbank['inchi_key'])
l1000_metadata = df_main.loc[~matched_in_drugbank].copy()

In [31]:
len(l1000_metadata)

3620

In [33]:
l1000_metadata.head()

Unnamed: 0,sig_id,SCS_centered_by_batch,batch,cell_id,mean_cosine_dist_centered_by_batch,pert_desc,pert_dose,pert_id,pert_time,LSM_id,pert_iname,inchi_key,pubchem_cid
1,CPC001_HA1E_24H:BRD-A00267231-001-01-1:10,0.0022,CPC001_HA1E_24H,HA1E,0.897247,hemado,10.0,BRD-A00267231,24.0,LSM-1233,hemado,KOCIMZNSNPOGOP-UHFFFAOYSA-N,4043357
2,CPC019_MCF7_24H:BRD-A00420644-001-01-7:10,0.0,CPC019_MCF7_24H,MCF7,0.728397,-666,10.0,BRD-A00420644,24.0,LSM-6366,SA-3676,ASCBUEVCEVGOFP-UHFFFAOYSA-N,2853908
3,CPC010_HA1E_6H:BRD-A00474148-001-02-3:10,0.0007,CPC010_HA1E_6H,HA1E,0.913214,-666,10.0,BRD-A00474148,6.0,LSM-1234,BRD-A00474148,RCGAUPRLRFZAMS-UHFFFAOYSA-N,44825297
4,CPC011_HA1E_6H:BRD-A00520476-001-05-8:10,0.0147,CPC011_HA1E_6H,HA1E,0.890211,"6h-pyrido[2,3-b][1,4]benzodiazepin-6-one, 11-[...",10.0,BRD-A00520476,6.0,LSM-22625,otenzepad,UBRKDAVQCKZSPO-UHFFFAOYSA-N,107867
6,CPC008_PC3_24H:BRD-A00626522-001-05-7:10,0.0308,CPC008_PC3_24H,PC3,0.972653,ncgc00189187-01,10.0,BRD-A00626522,24.0,LSM-1236,BRD-A00626522,JESWDHXLYTYZAC-UHFFFAOYSA-N,16746329


In [34]:
# Relabel the identifier columns with the headers used in the exported table.
export_headers = {
    'pert_id': 'Accession Number',
    'pert_iname': 'Common name',
    'inchi_key': 'InChI Key',
}
l1000_metadata = l1000_metadata.rename(columns=export_headers)

In [36]:
# Write the three renamed identifier columns to the metadata directory as a TSV
export_columns = ['Common name', 'InChI Key', 'Accession Number']
l1000_metadata.to_csv(
    '../../metadata/l1000fwd_metadata.tsv',
    sep='\t',
    columns=export_columns,
)

In [None]:
# Creating up and downregulated genelists for significant signature IDs
def _read_gmt_gene_sets(gmt_path, wanted_sig_ids):
    """Read a tab-separated GMT file into ``{signature id: [gene, ...]}``.

    Parameters
    ----------
    gmt_path : str
        Path of the GMT file. The first field of each line is used as the
        signature id, the second field is skipped, and the remaining fields
        are the set members.
    wanted_sig_ids : set
        Signature ids to keep; lines with any other id are ignored.

    Returns
    -------
    dict
        Maps each kept signature id to its member list. Only the text before
        the first comma of each member is kept (presumably this strips a
        ",weight" suffix -- verify against the file).
    """
    with open(gmt_path, 'r') as f:
        return {
            line[0]: [str(member).split(',')[0] for member in line[2:]]
            for line in csv.reader(f, delimiter='\t')
            # `line` is empty for blank lines; skip those instead of failing.
            if line and line[0] in wanted_sig_ids
        }


# NOTE(review): df_inchi is not defined in any cell visible in this notebook;
# confirm which table is meant before running from a fresh kernel.
# The ids are collected into a set once, so the lookup list is not rebuilt and
# scanned for every line of each file.
significant_sig_ids = set(df_inchi['sig_id'])

# NOTE(review): these input names differ from the files written by the
# up/down splitting cell (input/L1000FWD_drug_up_genes.gmt and
# input/L1000FWD_drug_down_genes.gmt) -- confirm the intended files.
d_up = _read_gmt_gene_sets('input/L1000FWD_signatures_up.gmt', significant_sig_ids)
d_down = _read_gmt_gene_sets('input/L1000FWD_signatures_down.gmt', significant_sig_ids)

In [None]:
len(d_up)

### Exporting GMT file of drugs matched to signatures for Enrichr API Querying

In [None]:
# Re-enter the L1000FWD directory by name via the parent directory. When the
# working directory is already a directory called L1000FWD this leaves it
# unchanged.
os.chdir('../L1000FWD')

In [None]:
# Export the drug -> gene-set dictionaries for up- and down-regulated genes.
# gmt_formatter is not defined in this notebook (presumably it comes from the
# wildcard script imports and writes its first argument to the given path in
# GMT format -- TODO confirm).
# NOTE(review): d_up_drugs and d_down_drugs are not defined in any cell visible
# here; confirm where they are built before running from a fresh kernel.
gmt_formatter(d_up_drugs, 'input/L1000FWD_drugsignatures_up.txt')
gmt_formatter(d_down_drugs, 'input/L1000FWD_drugsignatures_down.txt')

### Exporting a dictionary of pert ids matched to drug names for querying through SEP-L1000 API

In [None]:
# Tabulate the pert_id -> drug name dictionary (keys become the first column)
# and save it as a TSV for querying through the SEP-L1000 API.
# NOTE(review): drug_name_dict is not defined in any cell visible here.
pert2name_path = 'input/pertid_to_name.tsv'
df_pert2name = pd.DataFrame.from_dict(drug_name_dict, orient='index').reset_index()
df_pert2name.columns = ['pert_id', 'drug_name']
df_pert2name.to_csv(pert2name_path, sep='\t', index=False)

In [None]:
df_pert2name.head()

### Exporting drug-set libraries in GMT format

#### Matching genes to approved symbols

In [None]:
# Transposing dictionary so that genes are set-labels and drugs are set members
# transposer is not defined in this notebook (presumably it comes from the
# wildcard script imports -- TODO confirm).
# NOTE(review): d_up_drugs and d_down_drugs are not defined in any cell visible
# here; confirm where they are built before running from a fresh kernel.
drugsetlibrary_up = transposer(d_up_drugs)
drugsetlibrary_down = transposer(d_down_drugs)

In [None]:
def resolve_drugsetlibrary(library, min_set_size=5):
    """Relabel a gene -> drugs library with approved gene symbols and filter it.

    Parameters
    ----------
    library : dict
        Maps a gene label to the collection of drugs in that gene's set.
    min_set_size : int, optional
        Smallest number of drugs a set must contain to be kept. Defaults to 5,
        the threshold that was previously hard-coded.

    Returns
    -------
    dict
        Maps approved gene symbol to its drugs. Sets whose label has no
        approved symbol and sets smaller than ``min_set_size`` are left out.

    Notes
    -----
    ``gene_resolver`` is defined outside this notebook. It is expected to add
    an 'Approved Symbol' column to the frame it receives, in place; its return
    value is not used here.

    NOTE(review): when several labels in ``library`` resolve to the same
    approved symbol, the set of the label that comes last in ``library``
    replaces the earlier ones -- the drug members are not merged. Confirm that
    this is the intended handling of duplicates.
    """
    # One row per gene label; the resolver fills in the approved symbols.
    df_genes = pd.DataFrame({'Gene Name': list(library)})
    gene_resolver(df_genes)  # script that will match synonyms to approved symbols
    gene_dict = df_genes.set_index('Gene Name')['Approved Symbol'].to_dict()

    # Swap every label for its approved symbol; labels the resolver does not
    # know keep their original name for now.
    relabelled = {gene_dict.get(label, label): drugs for label, drugs in library.items()}

    # Walk the approved symbols in resolver order: this drops the labels that
    # kept an unapproved name and applies the minimum-size filter in one pass.
    drugsetlibrary = {}
    for symbol in gene_dict.values():
        if symbol in relabelled and len(relabelled[symbol]) >= min_set_size:
            drugsetlibrary[symbol] = relabelled[symbol]

    return drugsetlibrary

In [None]:
# Map gene labels to approved symbols and apply the set-size filter to the
# library built from up-regulated genes ...
drugsetlibrary_upregulated = resolve_drugsetlibrary(drugsetlibrary_up)

# ... and to the library built from down-regulated genes.
drugsetlibrary_downregulated = resolve_drugsetlibrary(drugsetlibrary_down)

In [None]:
# NOTE(review): this changes the kernel's working directory and does not change
# it back, so relative paths in anything run afterwards (including a re-run of
# this cell) resolve from the new location.
os.chdir('../data/L1000FWD')

# Export the resolved up-regulated library; the bare file name is relative to
# the directory entered above.
gmt_formatter(drugsetlibrary_upregulated, 'L1000FWD_signature_drugsetlibrary_up.txt')

# Export the resolved down-regulated library to the same directory.
gmt_formatter(drugsetlibrary_downregulated, 'L1000FWD_signature_drugsetlibrary_down.txt')