In [1]:
import os, sys, json
import pandas as pd
import numpy as np
import h5py
from pymongo import MongoClient
import requests

from cmapPy import pandasGEXpress as pdGEX
from cmapPy.pandasGEXpress import write_gctx as write_gctx

def file2list(fn, idx, sep='\t', header=False):
	"""Read one column of a delimited text file into a list.

	Args:
		fn: path of the text file to read.
		idx: index of the field to keep from each line (as used to index
			the result of ``str.split``, so negative values count from the end).
		sep: field separator handed to ``str.split`` (tab by default).
		header: if True, the first line of the file is skipped.

	Returns:
		A list holding field ``idx`` of every line that does not start
		with '#', in file order.

	Raises:
		IndexError: if a non-comment line has no field at ``idx``.
	"""
	# Trim surrounding whitespace, but never characters that belong to the
	# separator: a bare strip() would also swallow leading/trailing tabs and
	# shift every field index on rows whose first or last column is empty.
	if sep is None:
		# split(None) already splits on runs of whitespace; plain strip is fine.
		trim_chars = None
	else:
		trim_chars = ''.join(c for c in ' \t\n\r\x0b\x0c' if c not in sep)

	values = []
	with open(fn, 'r') as f:
		if header:
			# The default keeps an empty file from raising StopIteration.
			next(f, None)
		for line in f:
			if line.startswith('#'):
				continue
			fields = line.strip(trim_chars).split(sep)
			values.append(fields[idx])
	return values
# Service endpoints. The defaults are the addresses that were previously
# hard-coded here; set L1000FWD_RURL / L1000FWD_MONGOURI in the environment to
# point the notebook at another deployment without editing the code.
RURL = os.environ.get('L1000FWD_RURL', 'http://146.203.54.239:31722/custom/SigineDMOA')
MONGOURI = os.environ.get('L1000FWD_MONGOURI', 'mongodb://146.203.54.131:27017/L1000FWD')

# One shared MongoDB client; COLL_SIG is the `sigs` collection of the
# `L1000FWD` database.
client = MongoClient(MONGOURI)
DB = client['L1000FWD']
COLL_SIG = DB['sigs']

In [2]:
# Get the list of sig_ids from the API
SIG_IDS_URL = 'http://amp.pharm.mssm.edu/L1000FWD/sig_ids'

# The timeout keeps the cell from hanging indefinitely on an unresponsive
# server, and raise_for_status() turns an HTTP error response into an
# immediate, readable failure instead of a JSON/KeyError further down.
sig_ids_response = requests.get(SIG_IDS_URL, timeout=120)
sig_ids_response.raise_for_status()

# Keep the Response and the decoded list under separate names so that
# `all_sig_ids` only ever refers to the list of ids.
all_sig_ids = sig_ids_response.json()['sig_ids']
print(len(all_sig_ids))
print(all_sig_ids[:10])


42809
[u'AML001_CD34_6H:BRD-K43389675:10', u'AML001_PC3_6H:BRD-A19037878:0.37037', u'AML001_PC3_6H:BRD-A19037878:1.11111', u'AML001_PC3_6H:BRD-A19037878:10', u'AML001_PC3_6H:BRD-A19037878:3.33333', u'AML001_PC3_6H:BRD-A19500257:0.37037', u'AML001_PC3_6H:BRD-A19500257:1.11111', u'AML001_PC3_6H:BRD-A19500257:10', u'AML001_PC3_6H:BRD-A19500257:3.33333', u'AML001_PC3_6H:BRD-A45664787:10']


In [8]:
# Get sig_meta
# Fields to pull for each signature; `_id` is excluded explicitly below.
SIG_META_FIELDS = [
    'cell_id',
    'batch',
    'pert_dose',
    'pert_desc',
    'pert_id',
    'sig_id',
    'pert_time',
    'SCS_centered_by_batch',
    'mean_cosine_dist_centered_by_batch',
]
sig_meta_projection = dict.fromkeys(SIG_META_FIELDS, True)
sig_meta_projection['_id'] = False

sig_meta_docs = list(COLL_SIG.find({'sig_id': {'$in': all_sig_ids}},
                                   sig_meta_projection))
sig_meta_df = pd.DataFrame(sig_meta_docs).set_index('sig_id')

# The metadata rows are later paired positionally with the signature matrices,
# so every requested sig_id must be present exactly once. Fail here with a
# clear message rather than exporting blank or shifted metadata rows.
missing_sig_ids = set(all_sig_ids) - set(sig_meta_df.index)
assert not missing_sig_ids, '%d sig_ids have no metadata document, e.g. %s' % (
    len(missing_sig_ids), sorted(missing_sig_ids)[:5])
assert sig_meta_df.index.is_unique, 'duplicate sig_id values returned from the sigs collection'

# Reorder the rows to follow all_sig_ids.
sig_meta_df = sig_meta_df.loc[all_sig_ids]
print(sig_meta_df.shape)
sig_meta_df.head()

(42809, 8)


Unnamed: 0_level_0,SCS_centered_by_batch,batch,cell_id,mean_cosine_dist_centered_by_batch,pert_desc,pert_dose,pert_id,pert_time
sig_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AML001_CD34_6H:BRD-K43389675:10,0.0,AML001_CD34_6H,CD34,0.853471,DAUNORUBICIN,10.0,BRD-K43389675,6.0
AML001_PC3_6H:BRD-A19037878:0.37037,0.0,AML001_PC3_6H,PC3,0.505995,TRICHOSTATIN_A,0.37037,BRD-A19037878,6.0
AML001_PC3_6H:BRD-A19037878:1.11111,0.0,AML001_PC3_6H,PC3,0.676204,TRICHOSTATIN_A,1.11111,BRD-A19037878,6.0
AML001_PC3_6H:BRD-A19037878:10,0.0,AML001_PC3_6H,PC3,0.747633,TRICHOSTATIN_A,10.0,BRD-A19037878,6.0
AML001_PC3_6H:BRD-A19037878:3.33333,0.0,AML001_PC3_6H,PC3,0.659851,TRICHOSTATIN_A,3.33333,BRD-A19037878,6.0


In [None]:
# Cast every object-dtype column of the signature metadata to str
object_columns = [name for name in sig_meta_df.columns
                  if sig_meta_df.dtypes[name] == object]
for name in object_columns:
    sig_meta_df[name] = sig_meta_df[name].astype(str)

In [3]:
# Get ordered probes
# Read the full probe list inside a `with` block so the file handle is closed
# as soon as the JSON has been parsed (the bare open() call never closed it).
with open('../data/rid.json', 'rb') as rid_file:
    PROBES = json.load(rid_file)
# First column of the rid_lm1000 text file.
PROBES_LM1000 = file2list('../data/rid_lm1000.txt', 0)
print('%d %d' % (len(PROBES), len(PROBES_LM1000)))

22268 978


In [63]:
# Probe id -> gene symbol table from the `geneinfo` collection
COLL_GENES = client['LINCS_L1000_limma']['geneinfo']
gene_fields = {'_id': False, 'pr_id': True, 'pr_gene_symbol': True}
cur = COLL_GENES.find({}, gene_fields)
probes2genes = pd.DataFrame(list(cur))
print(probes2genes.shape)

# Replace the '-666' string with NaN (presumably a missing-value placeholder
# -- confirm against the data source), then index the table by probe id.
probes2genes = probes2genes.replace({'-666': np.nan})
probes2genes = probes2genes.set_index('pr_id')
print(probes2genes.shape)
probes2genes.head()

(22269, 2)
(22269, 1)


Unnamed: 0_level_0,pr_gene_symbol
pr_id,Unnamed: 1_level_1
202938_x_at,
204006_s_at,
204060_s_at,
204419_x_at,
204438_at,


In [65]:
# Shape of the annotation table when restricted to (and ordered by) PROBES
probes2genes.loc[PROBES].shape

(22268, 1)

In [10]:
# Get the CD matrices (full and lm)
# One row per signature, one column per probe. The widths come from the probe
# lists instead of hard-coded 978 / 22268 so the matrices always agree with the
# column labels they are given later. np.float64 replaces the `np.float` alias,
# which newer NumPy releases deprecate and then remove.
n_sigs = len(all_sig_ids)
cd_mat_lm = np.zeros((n_sigs, len(PROBES_LM1000)), dtype=np.float64)
cd_mat_full = np.zeros((n_sigs, len(PROBES)), dtype=np.float64)

cd_projection = {
    '_id': False,
    'CD_center_LM': True,
    'CD_center_Full': True,
}

for i, sig_id in enumerate(all_sig_ids):
    doc = COLL_SIG.find_one({'sig_id': sig_id}, cd_projection)
    if doc is None:
        # find_one returns None when nothing matches; name the offending id
        # instead of failing with "'NoneType' object is not subscriptable".
        raise KeyError('no document found in sigs for sig_id %r' % (sig_id,))

    # Row i of each matrix corresponds to all_sig_ids[i].
    cd_mat_lm[i] = doc['CD_center_LM']
    cd_mat_full[i] = doc['CD_center_Full']
    if i % 500 == 0:
        # Progress: current position and total number of signatures.
        print('%d %d' % (i, n_sigs))


0 42809
500 42809
1000 42809
1500 42809
2000 42809
2500 42809
3000 42809
3500 42809
4000 42809
4500 42809
5000 42809
5500 42809
6000 42809
6500 42809
7000 42809
7500 42809
8000 42809
8500 42809
9000 42809
9500 42809
10000 42809
10500 42809
11000 42809
11500 42809
12000 42809
12500 42809
13000 42809
13500 42809
14000 42809
14500 42809
15000 42809
15500 42809
16000 42809
16500 42809
17000 42809
17500 42809
18000 42809
18500 42809
19000 42809
19500 42809
20000 42809
20500 42809
21000 42809
21500 42809
22000 42809
22500 42809
23000 42809
23500 42809
24000 42809
24500 42809
25000 42809
25500 42809
26000 42809
26500 42809
27000 42809
27500 42809
28000 42809
28500 42809
29000 42809
29500 42809
30000 42809
30500 42809
31000 42809
31500 42809
32000 42809
32500 42809
33000 42809
33500 42809
34000 42809
34500 42809
35000 42809
35500 42809
36000 42809
36500 42809
37000 42809
37500 42809
38000 42809
38500 42809
39000 42809
39500 42809
40000 42809
40500 42809
41000 42809
41500 42809
42000 42809
4250

In [49]:
# Wrap the full-probe CD matrix in a GCToo object: rows are signatures (with
# sig_meta_df as row metadata), columns are the probes in PROBES (with an
# index-only column metadata frame).
gctoo_full = pdGEX.GCToo.GCToo(
    pd.DataFrame(cd_mat_full, index=all_sig_ids, columns=PROBES),
    row_metadata_df=sig_meta_df,
    col_metadata_df=pd.DataFrame(index=PROBES),
    make_multiindex=True,
)
# Write it out; the matrix dimensions are embedded in the output name.
n_rows_full, n_cols_full = gctoo_full.data_df.shape
write_gctx.write(gctoo_full,
                 "../data/download/CD_signatures_full_%dx%d" % (n_rows_full, n_cols_full))

In [51]:
# Wrap the LM-probe CD matrix in a GCToo object: rows are signatures (with
# sig_meta_df as row metadata), columns are the probes in PROBES_LM1000 (with
# an index-only column metadata frame).
gctoo_lm = pdGEX.GCToo.GCToo(
    pd.DataFrame(cd_mat_lm, index=all_sig_ids, columns=PROBES_LM1000),
    row_metadata_df=sig_meta_df,
    col_metadata_df=pd.DataFrame(index=PROBES_LM1000),
    make_multiindex=True,
)
# Write it out; the matrix dimensions are embedded in the output name.
n_rows_lm, n_cols_lm = gctoo_lm.data_df.shape
write_gctx.write(gctoo_lm,
                 "../data/download/CD_signatures_LM_%dx%d" % (n_rows_lm, n_cols_lm))

In [52]:
# Drop the references to both GCToo objects
del gctoo_full
del gctoo_lm

In [54]:
# compute adj_mat
from sklearn.metrics import pairwise_distances

# All-against-all cosine distances between the rows of the LM matrix
# (Y=None means the rows of cd_mat_lm are compared with each other).
adj_mat = pairwise_distances(cd_mat_lm, Y=None, metric='cosine')
print(adj_mat.shape)


(42809, 42809)
1.5711884356

NameError: name 'adj_max' is not defined

In [55]:
# Largest and smallest entry currently held in adj_mat
print('%s %s' % (adj_mat.max(), adj_mat.min()))

 1.5711884356 0.0


In [56]:
adj_mat = 1- adj_mat
print adj_mat.max(), adj_mat.min()

1.0 -0.571188435598


In [57]:
# Wrap the signature-by-signature matrix in a GCToo object; both axes are
# labelled with all_sig_ids, so sig_meta_df serves as row and column metadata.
gctoo_adj_mat = pdGEX.GCToo.GCToo(
    pd.DataFrame(adj_mat, index=all_sig_ids, columns=all_sig_ids),
    col_metadata_df=sig_meta_df,
    row_metadata_df=sig_meta_df,
    make_multiindex=True,
)
# Write it out; the matrix dimensions are embedded in the output name.
n_rows_adj, n_cols_adj = gctoo_adj_mat.data_df.shape
write_gctx.write(gctoo_adj_mat,
                 "../data/download/Adjacency_matrix_LM_space_%dx%d" % (n_rows_adj, n_cols_adj))

In [60]:
# Drop the reference to the GCToo wrapper (adj_mat itself is left defined)
del gctoo_adj_mat 

In [62]:
# Write metadata dfs
# Signature metadata table (indexed by sig_id) as CSV
sig_meta_df.to_csv('../data/download/CD_signature_metadata.csv')

In [66]:
# Write the probe annotation table once per probe list (LM first, then full),
# with the rows selected and ordered by that list.
probe_exports = (
    (PROBES_LM1000, '../data/download/Probes_L1000_metadata.csv'),
    (PROBES, '../data/download/Probes_full_metadata.csv'),
)
for probe_ids, out_csv in probe_exports:
    probes2genes.loc[probe_ids].to_csv(out_csv)