## Associating L1000FWD drug signatures with respective drug names
### Database : http://amp.pharm.mssm.edu/L1000FWD/

In [1]:
import pandas as pd
import os
import csv
from itertools import islice
from collections import defaultdict
import json

In [2]:
# Step into the sibling scripts directory before the star imports, then move
# to the L1000FWD directory; every relative 'input/...' path used below is
# resolved against that final working directory.
# NOTE(review): presumably the chdir is what makes `export_script` and
# `gene_resolver` importable by bare name - confirm. The helpers called later
# (gmt_formatter, transposer, gene_resolver) are not defined in this notebook
# and presumably come from these star imports.
os.chdir('../scripts')
from export_script import *
from gene_resolver import *
os.chdir('../L1000FWD')

In [3]:
# Load the signature metadata table and lower-case the perturbagen
# descriptions in the same expression.
df_sig_metadata = pd.read_csv('input/CD_signature_metadata.csv').assign(
    pert_desc=lambda frame: frame['pert_desc'].str.lower()
)

In [4]:
len(df_sig_metadata)

42809

In [5]:
df_sig_metadata.head()

Unnamed: 0,sig_id,SCS_centered_by_batch,batch,cell_id,mean_cosine_dist_centered_by_batch,pert_desc,pert_dose,pert_id,pert_time
0,AML001_CD34_6H:BRD-K43389675:10,0.0,AML001_CD34_6H,CD34,0.853471,daunorubicin,10.0,BRD-K43389675,6.0
1,AML001_PC3_6H:BRD-A19037878:0.37037,0.0,AML001_PC3_6H,PC3,0.505995,trichostatin_a,0.37037,BRD-A19037878,6.0
2,AML001_PC3_6H:BRD-A19037878:1.11111,0.0,AML001_PC3_6H,PC3,0.676204,trichostatin_a,1.11111,BRD-A19037878,6.0
3,AML001_PC3_6H:BRD-A19037878:10,0.0,AML001_PC3_6H,PC3,0.747633,trichostatin_a,10.0,BRD-A19037878,6.0
4,AML001_PC3_6H:BRD-A19037878:3.33333,0.0,AML001_PC3_6H,PC3,0.659851,trichostatin_a,3.33333,BRD-A19037878,6.0


In [6]:
# Creating separate files for up and down genes
#
# Lines at even offsets (0, 2, 4, ...) of the combined GMT go to the "up" file
# and lines at odd offsets (1, 3, 5, ...) go to the "down" file.
# Assumes the combined file alternates up/down rows - TODO confirm.
# NOTE(review): the cell that follows reads 'input/L1000FWD_signatures_up.gmt'
# and 'input/L1000FWD_signatures_down.gmt', not the files written here -
# confirm which pair of file names is intended.

# The `with` statement closes both handles on exit, so no explicit close()
# is needed afterwards.
with open('input/CD_signatures_binary_42809.gmt', 'r') as fin, open('input/L1000FWD_drug_up_genes.gmt', 'w') as fout:
    fout.writelines(islice(fin, 0, None, 2))

with open('input/CD_signatures_binary_42809.gmt', 'r') as fin, open('input/L1000FWD_drug_down_genes.gmt','w') as fout:
    fout.writelines(islice(fin, 1, None, 2))

In [7]:
def _read_gmt_signatures(path):
    """Read a tab-delimited GMT file into ``{row label: [gene, ...]}``.

    The first field of each row is the key, the second field is ignored, and
    every remaining field is a gene entry. Only the text before the first
    comma of an entry is kept. Blank rows and empty fields (e.g. produced by
    trailing tabs) are skipped instead of raising ``IndexError`` or yielding
    empty gene names.
    """
    with open(path, 'r') as f:
        reader = csv.reader(f, delimiter='\t')
        return {
            row[0]: [field.split(',')[0] for field in row[2:] if field]
            for row in reader
            if row
        }


# Creating dictionary of signature ids matched to gene signatures (upregulated)
d_up = _read_gmt_signatures('input/L1000FWD_signatures_up.gmt')

# Creating dictionary of signature ids matched to gene signatures (downregulated)
d_down = _read_gmt_signatures('input/L1000FWD_signatures_down.gmt')

In [8]:
print(len(d_up))
print(len(d_down))

42809
42809


### Filtering df_sig_metadata by lowest mean_cosine_dist_centered_by_batch

In [9]:
# For every pert_id keep the single row with the smallest
# mean_cosine_dist_centered_by_batch: sort ascending by (pert_id, distance),
# then take the first row of each pert_id group.
row_meta_df_g = (
    df_sig_metadata
    .sort_values(by=['pert_id', 'mean_cosine_dist_centered_by_batch'])
    .groupby('pert_id')
    .head(1)
)

In [10]:
len(row_meta_df_g)

4941

In [11]:
# Create a new dataframe holding only the smallest mean cosine distance
# signatures, in the same row order as row_meta_df_g.
# A single label-based selection replaces the row-by-row DataFrame.append
# loop, which re-copied the frame on every iteration and no longer exists in
# pandas >= 2.0. `.copy()` keeps later column assignments from touching
# df_sig_metadata.
df_filtered = df_sig_metadata.loc[row_meta_df_g.index].copy()

In [12]:
def _strip_bytes_wrapper(value):
    """Return *value* without a surrounding ``b'...'`` wrapper, if it has one.

    ``str.strip("b'")`` removes every leading/trailing ``b`` or ``'``
    character, so it would also truncate ordinary names that merely start or
    end with the letter "b". Only an actual ``b'...'`` wrapper is removed
    here; non-string values (e.g. NaN) are returned unchanged.
    """
    if (
        isinstance(value, str)
        and len(value) >= 3
        and value.startswith("b'")
        and value.endswith("'")
    ):
        return value[2:-1]
    return value


# Copy pert_desc over from row_meta_df_g (aligned on the shared row index),
# then remove any stringified-bytes wrapper from the names.
df_filtered['pert_desc'] = row_meta_df_g['pert_desc']
df_filtered['pert_desc'] = df_filtered['pert_desc'].apply(_strip_bytes_wrapper)

In [13]:
df_filtered.head()

Unnamed: 0,sig_id,SCS_centered_by_batch,batch,cell_id,mean_cosine_dist_centered_by_batch,pert_desc,pert_dose,pert_id,pert_time
26739,CPC015_PHH_24H:BRD-A00100033-001-04-8:10,0.0203,CPC015_PHH_24H,PHH,0.911062,nifurtimox,10.0,BRD-A00100033,24.0
336,CPC001_HA1E_24H:BRD-A00267231-001-01-1:10,0.0022,CPC001_HA1E_24H,HA1E,0.897247,hemado,10.0,BRD-A00267231,24.0
31189,CPC019_MCF7_24H:BRD-A00420644-001-01-7:10,0.0,CPC019_MCF7_24H,MCF7,0.728397,-666,10.0,BRD-A00420644,24.0
16182,CPC010_HA1E_6H:BRD-A00474148-001-02-3:10,0.0007,CPC010_HA1E_6H,HA1E,0.913214,-666,10.0,BRD-A00474148,6.0
17487,CPC011_HA1E_6H:BRD-A00520476-001-05-8:10,0.0147,CPC011_HA1E_6H,HA1E,0.890211,"6h-pyrido[2,3-b][1,4]benzodiazepin-6-one, 11-[...",10.0,BRD-A00520476,6.0


In [14]:
# Set of the signature ids that survived the filtering step
sig_id_list = {*df_filtered['sig_id'].tolist()}

# Reference lookup sig_id -> pert_id (needed to match sig ids to the correct drug names)
pert_id_dict = df_filtered.set_index('sig_id')['pert_id'].to_dict()

In [15]:
# Keep only the entries of the up/down dictionaries whose signature id is in sig_id_list
d_up_filtered = {sig_id: d_up[sig_id] for sig_id in d_up if sig_id in sig_id_list}
d_down_filtered = {sig_id: d_down[sig_id] for sig_id in d_down if sig_id in sig_id_list}

In [16]:
print(len(d_up_filtered))
print(len(d_down_filtered))

4941
4941


In [17]:
# Re-key both dictionaries by pert_id; a signature id without an entry in
# pert_id_dict is kept as its own key.
d_up_filtered_pertids = {
    pert_id_dict.get(sig_id, sig_id): genes
    for sig_id, genes in d_up_filtered.items()
}
d_down_filtered_pertids = {
    pert_id_dict.get(sig_id, sig_id): genes
    for sig_id, genes in d_down_filtered.items()
}

In [18]:
print(len(d_up_filtered_pertids))
print(len(d_down_filtered_pertids))

4941
4941


### Importing drug metadata to match pert_ids to drug names 

In [19]:
# Load the drug metadata table and lower-case the drug names (pert_iname)
# in the same expression.
df_metadata = pd.read_csv('input/Drugs_metadata.csv').assign(
    pert_iname=lambda frame: frame['pert_iname'].str.lower()
)

In [20]:
df_metadata.head()

Unnamed: 0,pert_id,LSM_id,pert_iname,alt_name,canonical_smiles,inchi_key,inchi_string,molecular_formula,molecular_wt,pert_collection,pert_summary,pert_url,pubchem_cid
0,BRD-A00100033,LSM-1232,nifurtimox,,CC1CS(=O)(=O)CCN1N=Cc2ccc(o2)[N+](=O)[O-],InChIKey=ARFHIAQFJWUCFH-UHFFFAOYSA-N,"InChI=1S/C10H13N3O5S/c1-8-7-19(16,17)5-4-12(8)...",C10H13N3O5S,287.292,BIOA,,http://en.wikipedia.org/wiki/Nifurtimox,6842999
1,BRD-A00520476,LSM-22625,otenzepad,AF-DX 116,CCN(CC)CC1CCCCN1CC(=O)N2c3ccccc3C(=O)Nc4cccnc24,InChIKey=UBRKDAVQCKZSPO-UHFFFAOYSA-N,InChI=1S/C24H31N5O2/c1-3-27(4-2)16-18-10-7-8-1...,C24H31N5O2,421.535,BIOA,,http://www.tocris.com/dispprod.php?ItemId=2135...,107867
2,BRD-A00546892,LSM-1235,biperiden,S1285,OC(CCN1CCCCC1)(C2CC3CC2C=C3)c4ccccc4,InChIKey=YSXKPIUOCJLQIE-UHFFFAOYSA-N,"InChI=1S/C21H29NO/c23-21(19-7-3-1-4-8-19,11-14...",C21H29NO,311.461,BIOA,Biperiden is an antiparkinsonian agent of the ...,"http://en.wikipedia.org/wiki/Biperiden,http://...",92151
3,BRD-A00758722,LSM-1237,noretynodrel,norethynodrel,C[C@@]12CCC3C(CCC4=C3CCC(=O)C4)C2CC[C@@]1(O)C#C,InChIKey=ICTXHFFSOAJUMG-OQPPHWFISA-N,InChI=1S/C20H26O2/c1-3-20(22)11-9-18-17-6-4-13...,C20H26O2,298.419,BIOA,,"http://en.wikipedia.org/wiki/Norethynodrel,htt...",5702095
4,BRD-A00827783,LSM-4299,dyphylline,diprophylline,Cn1c(=O)n(C)c2ncn(CC(O)CO)c2c1=O,InChIKey=KSCFJBIXMNOVSH-UHFFFAOYSA-N,InChI=1S/C10H14N4O4/c1-12-8-7(9(17)13(2)10(12)...,C10H14N4O4,254.243,BIOA,,http://en.wikipedia.org/wiki/Dyphylline,3182


In [21]:
# Creating list of significant pert_ids (the keys of the re-keyed up dictionary)
pert_id_list = list(d_up_filtered_pertids)

# Filtering the metadata dataframe by significant pert_ids.
# One vectorised membership mask replaces the iterrows loop, which dropped
# rows in place while iterating and scanned the whole list for every row.
df_metadata = df_metadata[df_metadata['pert_id'].isin(pert_id_list)].copy()

In [22]:
len(df_metadata)

4941

In [23]:
# Dropping all rows whose drug name (pert_iname) already occurred earlier, so
# that every remaining drug name belongs to exactly one pert_id.
# All these drugs are significant regardless, and it is preferable for there
# to be one drug name matched to each pert_id.
is_repeated_name = df_metadata.duplicated(subset='pert_iname', keep='first')
df_metadata = df_metadata[~is_repeated_name]

In [24]:
len(df_metadata)

4606

In [25]:
df_metadata.head()

Unnamed: 0,pert_id,LSM_id,pert_iname,alt_name,canonical_smiles,inchi_key,inchi_string,molecular_formula,molecular_wt,pert_collection,pert_summary,pert_url,pubchem_cid
0,BRD-A00100033,LSM-1232,nifurtimox,,CC1CS(=O)(=O)CCN1N=Cc2ccc(o2)[N+](=O)[O-],InChIKey=ARFHIAQFJWUCFH-UHFFFAOYSA-N,"InChI=1S/C10H13N3O5S/c1-8-7-19(16,17)5-4-12(8)...",C10H13N3O5S,287.292,BIOA,,http://en.wikipedia.org/wiki/Nifurtimox,6842999
1,BRD-A00520476,LSM-22625,otenzepad,AF-DX 116,CCN(CC)CC1CCCCN1CC(=O)N2c3ccccc3C(=O)Nc4cccnc24,InChIKey=UBRKDAVQCKZSPO-UHFFFAOYSA-N,InChI=1S/C24H31N5O2/c1-3-27(4-2)16-18-10-7-8-1...,C24H31N5O2,421.535,BIOA,,http://www.tocris.com/dispprod.php?ItemId=2135...,107867
2,BRD-A00546892,LSM-1235,biperiden,S1285,OC(CCN1CCCCC1)(C2CC3CC2C=C3)c4ccccc4,InChIKey=YSXKPIUOCJLQIE-UHFFFAOYSA-N,"InChI=1S/C21H29NO/c23-21(19-7-3-1-4-8-19,11-14...",C21H29NO,311.461,BIOA,Biperiden is an antiparkinsonian agent of the ...,"http://en.wikipedia.org/wiki/Biperiden,http://...",92151
3,BRD-A00758722,LSM-1237,noretynodrel,norethynodrel,C[C@@]12CCC3C(CCC4=C3CCC(=O)C4)C2CC[C@@]1(O)C#C,InChIKey=ICTXHFFSOAJUMG-OQPPHWFISA-N,InChI=1S/C20H26O2/c1-3-20(22)11-9-18-17-6-4-13...,C20H26O2,298.419,BIOA,,"http://en.wikipedia.org/wiki/Norethynodrel,htt...",5702095
5,BRD-A00993607,LSM-1238,alprenolol,,CC(C)NCC(O)COc1ccccc1CC=C,InChIKey=PAZJSJFMUHDSTF-UHFFFAOYSA-N,InChI=1S/C15H23NO2/c1-4-7-13-8-5-6-9-15(13)18-...,C15H23NO2,285.81,BIOA,,http://en.wikipedia.org/wiki/Alprenolol,66368


In [26]:
# Reference lookup pert_id -> drug name
drug_name_dict = df_metadata.set_index('pert_id')['pert_iname'].to_dict()

# Reference list of the significant drug names
drug_names = df_metadata['pert_iname'].to_list()

### Exporting updated metadata for harmonization

In [None]:
# Exporting the edited drug metadata, which at this point contains only the
# rows kept by the significant-signature and duplicate-name filters above.
# The row index is not written to the file.
df_metadata.to_csv('input/Drugs_metadata_significant.csv', index = False)

In [31]:
# Creating list of harmonized drugs & filtering drug_name_dict
# NOTE(review): `df_harmonized` is not defined in any cell visible in this
# notebook (execution counts jump from 26 to 31) - confirm where it is loaded
# before running top to bottom.
harmonized_drugs = df_harmonized['Drug_name'].tolist()

# Membership is tested once per dictionary entry, so look names up in a set
# instead of scanning the list each time.
harmonized_lookup = set(harmonized_drugs)
drug_name_dict = {k: v for k, v in drug_name_dict.items() if v in harmonized_lookup}

In [32]:
len(drug_name_dict)

4447

In [33]:
# Replacing pert_id dictionary key with drug name; a pert_id without an entry
# in drug_name_dict is kept as its own key for now.
d_up_drugs = {drug_name_dict.get(k, k): v for k, v in d_up_filtered_pertids.items()}
d_down_drugs = {drug_name_dict.get(k, k): v for k, v in d_down_filtered_pertids.items()}

# Removing unmatched keys: only keys that are known drug names survive.
# The names are put in a set first so each lookup is constant time rather
# than a scan of the whole drug_names list.
known_drug_names = set(drug_names)
d_up_drugs = {k: v for k, v in d_up_drugs.items() if k in known_drug_names}
d_down_drugs = {k: v for k, v in d_down_drugs.items() if k in known_drug_names}

In [34]:
print(len(d_up_drugs))
print(len(d_down_drugs))

4447
4447


### Exporting GMT file of drugs matched to signatures for Enrichr API Querying

In [48]:
# Move to the sibling 'L1000FWD' directory of the current working directory;
# the relative 'input/...' output paths below are resolved against it.
os.chdir('../L1000FWD')

In [None]:
# Write the drug-name -> gene-list dictionaries to text files under input/.
# NOTE(review): gmt_formatter is not defined in this notebook (presumably it
# comes from export_script); its exact output format should be confirmed there.
gmt_formatter(d_up_drugs, 'input/L1000FWD_drugsignatures_up.txt')
gmt_formatter(d_down_drugs, 'input/L1000FWD_drugsignatures_down.txt')

### Exporting a dictionary of pert ids matched to drug names for querying through SEP-L1000 API

In [None]:
# Exporting pert_ids matched to drug names for querying through SEP-L1000 API:
# turn the pert_id -> drug name lookup into a two-column table and save it as
# a tab-separated file without the row index.
df_pert2name = pd.DataFrame.from_dict(drug_name_dict, orient='index').reset_index()
df_pert2name.columns = ['pert_id', 'drug_name']
df_pert2name.to_csv('input/pertid_to_name.tsv', sep='\t', index=False)

In [None]:
df_pert2name.head()

### Exporting drug-set libraries in GMT format

#### Matching genes to approved symbols

In [35]:
# Transposing dictionary so that genes are set-labels and drugs are set members
# NOTE(review): transposer is not defined in this notebook (presumably it
# comes from export_script); the result is later iterated as
# gene -> collection of drugs, so that is the shape it is assumed to return.
drugsetlibrary_up = transposer(d_up_drugs)
drugsetlibrary_down = transposer(d_down_drugs)

In [42]:
def resolve_drugsetlibrary(library):
    """Re-key a gene -> drugs library by approved gene symbol.

    Parameters
    ----------
    library : dict
        Maps a gene name to an iterable of drug names.

    Returns
    -------
    dict
        Maps each approved symbol to a list of its drugs (duplicates removed,
        first-seen order kept). Genes without an approved symbol are dropped,
        as are sets with fewer than 5 drugs.

    Notes
    -----
    ``gene_resolver`` is defined outside this notebook. It is assumed to add
    an 'Approved Symbol' column to the dataframe in place - TODO confirm.

    When several gene names resolve to the same approved symbol their drug
    sets are merged, so no drugs are lost to one entry overwriting another.
    """
    df_genes = pd.DataFrame({'Gene Name': list(library)})

    gene_resolver(df_genes)  # script that will match synonyms to approved symbols
    gene_dict = df_genes.set_index('Gene Name').to_dict()['Approved Symbol']

    # Collect the drugs of every gene under its approved symbol.
    merged = {}
    for gene, symbol in gene_dict.items():
        # Skip genes that have no approved symbol (missing value) and any
        # name that is not a key of the input library.
        if pd.isna(symbol) or gene not in library:
            continue
        merged.setdefault(symbol, []).extend(library[gene])

    # dict.fromkeys removes repeated drugs while preserving their order.
    drugsetlibrary = {
        symbol: list(dict.fromkeys(drugs)) for symbol, drugs in merged.items()
    }

    # Drop sets with less than 5 drugs
    return {symbol: drugs for symbol, drugs in drugsetlibrary.items() if len(drugs) >= 5}

In [44]:
# Map the gene labels of both transposed libraries to approved symbols and
# drop the sets that end up with fewer than 5 drugs.
drugsetlibrary_upregulated = resolve_drugsetlibrary(drugsetlibrary_up)

drugsetlibrary_downregulated = resolve_drugsetlibrary(drugsetlibrary_down)

In [51]:
# Switch to the output directory (resolved relative to the current working
# directory) and write the up- and down-regulated drug-set libraries there.
# NOTE(review): gmt_formatter is not defined in this notebook (presumably it
# comes from export_script); confirm its output format there.
os.chdir('../data/L1000FWD')

gmt_formatter(drugsetlibrary_upregulated, 'L1000FWD_signature_drugsetlibrary_up.txt')

gmt_formatter(drugsetlibrary_downregulated, 'L1000FWD_signature_drugsetlibrary_down.txt')