# CMAP

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: https://www.broadinstitute.org/cmap/

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
# Re-import the local helper module so that edits made to it on disk are
# picked up without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/cmap/untility_functions.py'>

# Load Data

In [3]:
# Gene-level associations: per the preview below, each row carries a signature
# id (signature_fk), a gene symbol, a fold change and p/q values. The first
# column of the file is used as the row index.
association_file = 'Input/cmap_association_2017_03_09.tsv.zip'
df = pd.read_csv(association_file, index_col=0, sep='\t')

In [4]:
df.head()

Unnamed: 0,signature_fk,gene_symbol,fold_change,p_value,q_value
1,1,AAR2,-0.435962,0.002949103,0.016078
2,1,AASDHPPT,0.663748,1.06631e-05,0.000167
3,1,ABCC1,0.29452,0.006385891,0.030025
4,1,ABCC10,0.309518,0.009982904,0.042797
5,1,ABCC4,0.981755,8.298603e-08,4e-06


In [5]:
df.shape

(197234, 5)

# Load Sample Meta Data

In [6]:
# Per-signature metadata (drug, cell line, perturbation time and dose), indexed
# by the signature ID taken from the first column of the file.
signature_file = 'Input/cmap_signature_2017_03_09.tsv.zip'
sample_meta = pd.read_csv(signature_file, index_col=0, sep='\t')

In [7]:
sample_meta.head()

Unnamed: 0_level_0,DRUG_NAME,CELL_NAME,CELL_INFO,PERTUBATION_TIME,PERTUBATION_TIME_UNIT,PERTUBATION_DOSE,PERTUBATION_DOSE_UNIT
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,15-delta prostaglandin J2,HL60,human promyelocytic cell line established by l...,6.0,h,10.0,μM
2,15-delta prostaglandin J2,MCF7,human breast epithelial adenocarcinoma cell li...,6.0,h,10.0,μM
3,15-delta prostaglandin J2,PC3,epithelial cell line established from human pr...,6.0,h,10.0,μM
4,3-hydroxy-DL-kynurenine,MCF7,human breast epithelial adenocarcinoma cell li...,6.0,h,17.8,μM
5,6-benzylaminopurine,MCF7,human breast epithelial adenocarcinoma cell li...,6.0,h,17.8,μM


# Get Relevant Data

In [8]:
# Replace each numeric signature id in df['signature_fk'] with a readable label
# of the form
#     <drug>_<cell line>_<time><time unit>_<dose><dose unit>
#
# The label is built once per signature from the metadata table and then mapped
# onto the association rows. This replaces a per-row loop that performed twelve
# scalar lookups per row through the `.ix` indexer, which no longer exists in
# pandas >= 1.0.
label_columns = ['DRUG_NAME', 'CELL_NAME',
                 'PERTUBATION_TIME', 'PERTUBATION_TIME_UNIT',
                 'PERTUBATION_DOSE', 'PERTUBATION_DOSE_UNIT']

signature_labels = pd.Series(
    ['%s_%s_%s%s_%s%s' % fields
     for fields in sample_meta[label_columns].itertuples(index=False, name=None)],
    index=sample_meta.index,
    dtype=object,
)

# Series.map looks every id up by label in the metadata index; ids that are
# absent come back as NaN.
mapped_labels = df['signature_fk'].map(signature_labels)

# Fail loudly (as the label-based lookup did) instead of silently writing NaN
# labels. This also catches an accidental second run of this cell, when
# signature_fk already holds labels rather than ids.
unmapped = mapped_labels.isna()
if unmapped.any():
    raise KeyError('signature_fk values not found in sample_meta index: %s'
                   % df.loc[unmapped, 'signature_fk'].unique()[:10].tolist())

df['signature_fk'] = mapped_labels

Progeres: 100%  197234 Out of 197234   

In [9]:
# Keep only the columns used to build the gene x signature matrix; the p/q
# value columns are dropped from here on.
kept_columns = ['gene_symbol', 'signature_fk', 'fold_change']
df = df[kept_columns]

In [10]:
df.head()

Unnamed: 0,gene_symbol,signature_fk,fold_change
1,AAR2,15-delta prostaglandin J2_HL60_6.0h_10.0μM,-0.435962
2,AASDHPPT,15-delta prostaglandin J2_HL60_6.0h_10.0μM,0.663748
3,ABCC1,15-delta prostaglandin J2_HL60_6.0h_10.0μM,0.29452
4,ABCC10,15-delta prostaglandin J2_HL60_6.0h_10.0μM,0.309518
5,ABCC4,15-delta prostaglandin J2_HL60_6.0h_10.0μM,0.981755


# Create Matrix

In [11]:
# Distinct gene symbols (future matrix rows) and signature labels (future
# matrix columns), each in order of first appearance in df.
genes = df['gene_symbol'].drop_duplicates().tolist()
samples = df['signature_fk'].drop_duplicates().tolist()

In [12]:
# Index the associations by signature label so that all rows of one signature
# can be pulled out with a single label lookup.
df = df.set_index('signature_fk')

In [13]:
# Gene x signature matrix, pre-filled with 0.0 so that every (gene, signature)
# pair without an association reads as "no change".
# Initialising with a float scalar avoids first allocating an object-dtype
# all-NaN frame and then replacing NaN in place, whose resulting dtype depends
# on the pandas version.
matrix = pd.DataFrame(0.0, index=genes, columns=samples)

In [14]:
# Fill the gene x signature matrix column by column with the fold changes.
# `df` is indexed by signature label at this point, so df.loc[[label]] returns
# every association row of that signature.

# Make sure the matrix can hold fold changes regardless of how it was
# initialised (no-op when it is already float).
matrix = matrix.astype(float)

n_signatures = len(matrix.columns)
for i, col in enumerate(matrix.columns):

    progress = ((i+1)/n_signatures)*100

    sys.stdout.write("Progress: %d Out of %d which is %d%%   \r" % ((i+1), n_signatures, progress))
    sys.stdout.flush()

    # Selecting with a *list* of labels always yields a DataFrame, even when the
    # signature has a single row. The scalar form df.loc[col, 'gene_symbol']
    # returns a plain str in that case, which has no `.shape`, so the former
    # single-row branch could never run. `.loc` replaces the removed `.ix`.
    signature_rows = df.loc[[col], ['gene_symbol', 'fold_change']]
    matrix.loc[signature_rows['gene_symbol'].values, col] = signature_rows['fold_change'].values

Progress: 200 Out of 200 which is 100%   

In [15]:
matrix.head()

Unnamed: 0,15-delta prostaglandin J2_HL60_6.0h_10.0μM,15-delta prostaglandin J2_MCF7_6.0h_10.0μM,15-delta prostaglandin J2_PC3_6.0h_10.0μM,3-hydroxy-DL-kynurenine_MCF7_6.0h_17.8μM,6-benzylaminopurine_MCF7_6.0h_17.8μM,6-bromoindirubin-3'-oxime_MCF7_6.0h_0.5μM,6-bromoindirubin-3'-oxime_PC3_6.0h_0.5μM,AG-013608_MCF7_6.0h_10.0μM,AG-013608_PC3_6.0h_10.0μM,LM-1685_MCF7_6.0h_10.0μM,...,valproic acid_HL60_6.0h_200.0μM,valproic acid_HL60_6.0h_500.0μM,valproic acid_HL60_6.0h_1000.0μM,valproic acid_MCF7_6.0h_200.0μM,valproic acid_MCF7_6.0h_500.0μM,valproic acid_MCF7_6.0h_1000.0μM,valproic acid_PC3_6.0h_1000.0μM,vincamine_MCF7_6.0h_11.2μM,wortmannin_HL60_6.0h_0.01μM,wortmannin_MCF7_6.0h_0.01μM
AAR2,-0.435962,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.644113,0.32063,0.0,0.0,0.251598,0.407431,0.0,0.0,0.0
AASDHPPT,0.663748,0.0,0.0,0.0,0.0,0.0,0.47792,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ABCC1,0.29452,-0.775475,-0.732066,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.514414,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ABCC10,0.309518,0.328248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.44478,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ABCC4,0.981755,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,-0.43702,0.521003,0.0,0.0,0.0


In [16]:
matrix.shape

(11962, 200)

# Save Unfiltered Matrix To File

In [17]:
# Write the raw gene x signature matrix, stamped with the current year and
# month (YYYY_MM). The path is relative to the notebook's working directory,
# like the 'Input/' files read above and the 'Output/' files written by the
# later cells, instead of a hard-coded location under the author's home.
# NOTE(review): the payload is gzip-compressed although the name ends in
# '.zip'; readers that infer the compression from the extension will not be
# able to open it. Kept as is to avoid changing the output format.
filename = 'Output/cmap_matrix_unfilltered_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
matrix.to_csv(filename, sep='\t', compression='gzip')

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [18]:
# Map the matrix's gene symbols to current approved symbols (per the section
# heading). The return value is not captured, yet the `matrix.shape` cell that
# follows reports 11850 rows instead of 11962, so the helper appears to modify
# `matrix` in place and drop unmapped rows — confirm in untility_functions.
uf.mapgenesymbols(matrix)

Progeres: 97%  11614 Out of 11962   

In [20]:
matrix.shape

(11850, 200)

# Merge Duplicate Genes By Rows

In [21]:
# Collapse rows that ended up with the same gene symbol into a single row.
# The 'row' / 'mean' arguments presumably select the axis and the aggregation
# (average of the duplicates) — confirm in untility_functions.
# The shape cells around this call show 11850 rows before and 11801 after.
normalized_matrix = uf.merge(matrix, 'row', 'mean')

In [22]:
normalized_matrix.shape

(11801, 200)

# Save Filtered Matrix

In [23]:
# Write the symbol-mapped, duplicate-merged matrix, stamped with the current
# year and month (YYYY_MM). The path is relative to the notebook's working
# directory, like the 'Input/' files read above and the 'Output/' files written
# by the later cells, instead of a hard-coded location under the author's home.
# NOTE(review): the payload is gzip-compressed although the name ends in
# '.zip'; readers that infer the compression from the extension will not be
# able to open it. Kept as is to avoid changing the output format.
filename = 'Output/cmap_matrix_filltered_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
normalized_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Tertiary Matrix

In [24]:
# Derive a discretised matrix from the fold changes. The preview below shows
# the same gene rows and signature columns holding 0.0 / 1.0 values;
# presumably -1.0 marks down-regulated entries (the "down" libraries built
# later rely on this matrix) — confirm the thresholding in untility_functions.
tertiary_matrix = uf.createTertiaryMarix(normalized_matrix)

Progeres: 100%  200 Out of 200   

In [25]:
tertiary_matrix.head()

Unnamed: 0_level_0,15-delta prostaglandin J2_HL60_6.0h_10.0μM,15-delta prostaglandin J2_MCF7_6.0h_10.0μM,15-delta prostaglandin J2_PC3_6.0h_10.0μM,3-hydroxy-DL-kynurenine_MCF7_6.0h_17.8μM,6-benzylaminopurine_MCF7_6.0h_17.8μM,6-bromoindirubin-3'-oxime_MCF7_6.0h_0.5μM,6-bromoindirubin-3'-oxime_PC3_6.0h_0.5μM,AG-013608_MCF7_6.0h_10.0μM,AG-013608_PC3_6.0h_10.0μM,LM-1685_MCF7_6.0h_10.0μM,...,valproic acid_HL60_6.0h_200.0μM,valproic acid_HL60_6.0h_500.0μM,valproic acid_HL60_6.0h_1000.0μM,valproic acid_MCF7_6.0h_200.0μM,valproic acid_MCF7_6.0h_500.0μM,valproic acid_MCF7_6.0h_1000.0μM,valproic acid_PC3_6.0h_1000.0μM,vincamine_MCF7_6.0h_11.2μM,wortmannin_HL60_6.0h_0.01μM,wortmannin_MCF7_6.0h_0.01μM
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GALT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GNT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAAS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


# Save Tertiary Matrix

In [26]:
# Write the tertiary matrix, stamped with the current year and month (YYYY_MM).
# The path is relative to the notebook's working directory, like the 'Input/'
# files read above and the 'Output/' files written by the later cells, instead
# of a hard-coded location under the author's home.
# NOTE(review): the payload is gzip-compressed although the name ends in
# '.zip'; readers that infer the compression from the extension will not be
# able to open it. Kept as is to avoid changing the output format.
filename = 'Output/cmap_tertiary_matrix_%s.tsv.zip' % datetime.date.today().strftime('%Y_%m')
tertiary_matrix.to_csv(filename, sep='\t', compression='gzip')

### Path to output files

In [27]:
# Output directory handed to the uf.create*Lib helpers below. It is derived
# from the notebook's working directory instead of a user-specific absolute
# path, so the notebook also runs on other machines. The result is still
# absolute and still ends with a path separator, like the previous literal.
path = os.path.join(os.getcwd(), 'Output', '')

# Create Up Gene Set Library

In [36]:
# Name passed to uf.createUpGeneSetLib in the next cell.
name = 'cmap_gene_up_set'

In [37]:
# Build the "up" gene set library from the tertiary matrix. `path` and `name`
# are handed to the helper, presumably as output directory and base file name
# — confirm in untility_functions. The return value is not captured.
uf.createUpGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  200 Out of 200   

# Create Down Gene Set Library

In [38]:
# Name passed to uf.createDownGeneSetLib in the next cell.
name = 'cmap_gene_down_set'

In [39]:
# Build the "down" gene set library from the tertiary matrix. `path` and `name`
# are handed to the helper, presumably as output directory and base file name
# — confirm in untility_functions. The return value is not captured.
uf.createDownGeneSetLib(tertiary_matrix, path, name)

Progeres: 100%  200 Out of 200   

# Create Up Attribute Library

In [32]:
# Name passed to uf.createUpAttributeSetLib in the next cell.
name = 'cmap_attribute_up_set'

In [33]:
# Build the "up" attribute set library from the tertiary matrix. `path` and
# `name` are handed to the helper, presumably as output directory and base
# file name — confirm in untility_functions. The return value is not captured.
uf.createUpAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  11801 Out of 11801   

# Create Down Attribute Library

In [34]:
# Name passed to uf.createDownAttributeSetLib in the next cell.
name = 'cmap_attribute_down_set'

In [35]:
# Build the "down" attribute set library from the tertiary matrix. `path` and
# `name` are handed to the helper, presumably as output directory and base
# file name — confirm in untility_functions. The return value is not captured.
uf.createDownAttributeSetLib(tertiary_matrix, path, name)

Progeres: 100%  11801 Out of 11801   

# Create Gene Similarity Matrix

In [40]:
# Pairwise similarity between the rows (genes) of the normalised matrix using
# the 'cosine' metric. The preview below is a gene x gene table with 1.0 on
# the diagonal.
gene_similarity_matix = uf.createSimilarityMatrix(normalized_matrix, 'cosine')

In [41]:
gene_similarity_matix.head()

index,A1CF,A2M,A4GALT,A4GNT,AAAS,AACS,AADAC,AAGAB,AAK1,AAMDC,...,ZSWIM1,ZSWIM8,ZW10,ZWILCH,ZWINT,ZXDB,ZXDC,ZYX,ZZEF1,ZZZ3
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1CF,1.0,0.0,0.0,0.0,0.0,-0.026048,0.0,-0.09215,-0.222799,-0.073334,...,-0.207944,-0.100289,-0.005417,-0.00797,0.001735,0.0,0.013545,0.03141,0.0,0.045275
A2M,0.0,1.0,0.0,0.0,-0.059691,0.133125,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-0.047145,0.0,0.0,-0.037058,0.0,0.043078
A4GALT,0.0,0.0,1.0,0.0,-0.076398,-0.032549,0.196261,0.049255,0.056752,0.080355,...,0.0,0.164826,-0.316071,0.091921,-0.096001,0.0,-0.221081,0.165628,0.028054,-0.191353
A4GNT,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AAAS,0.0,-0.059691,-0.076398,0.0,1.0,-0.139343,-0.247896,-0.117362,-0.056557,-0.073071,...,0.0,-0.090048,0.338495,-0.131262,0.106512,0.0,0.352138,-0.31021,-0.300689,0.219232


# Save Gene Similarity Matrix

In [42]:
# Write the gene similarity matrix as a tab-separated file whose name carries
# the current year and month (YYYY_MM).
# NOTE(review): gzip payload under a '.zip' name.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = 'Output/cmap_gene_similarity_matix_%s.tsv.zip' % date_tag
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [43]:
# Same similarity computation on the transposed matrix, so the compared rows
# are the signatures (attributes). The preview below is a signature x
# signature table with 1.0 on the diagonal.
attribute_similarity_matix = uf.createSimilarityMatrix(normalized_matrix.T, 'cosine')

In [44]:
attribute_similarity_matix.head()

Unnamed: 0,15-delta prostaglandin J2_HL60_6.0h_10.0μM,15-delta prostaglandin J2_MCF7_6.0h_10.0μM,15-delta prostaglandin J2_PC3_6.0h_10.0μM,3-hydroxy-DL-kynurenine_MCF7_6.0h_17.8μM,6-benzylaminopurine_MCF7_6.0h_17.8μM,6-bromoindirubin-3'-oxime_MCF7_6.0h_0.5μM,6-bromoindirubin-3'-oxime_PC3_6.0h_0.5μM,AG-013608_MCF7_6.0h_10.0μM,AG-013608_PC3_6.0h_10.0μM,LM-1685_MCF7_6.0h_10.0μM,...,valproic acid_HL60_6.0h_200.0μM,valproic acid_HL60_6.0h_500.0μM,valproic acid_HL60_6.0h_1000.0μM,valproic acid_MCF7_6.0h_200.0μM,valproic acid_MCF7_6.0h_500.0μM,valproic acid_MCF7_6.0h_1000.0μM,valproic acid_PC3_6.0h_1000.0μM,vincamine_MCF7_6.0h_11.2μM,wortmannin_HL60_6.0h_0.01μM,wortmannin_MCF7_6.0h_0.01μM
15-delta prostaglandin J2_HL60_6.0h_10.0μM,1.0,0.398283,0.224435,-0.03306,-0.013305,-0.024279,-0.049886,-0.004499,0.008345,0.006105,...,0.13989,0.188219,0.228207,0.019276,0.037565,0.031933,0.038985,0.003405,0.107472,0.030201
15-delta prostaglandin J2_MCF7_6.0h_10.0μM,0.398283,1.0,0.347837,0.020978,-0.005801,0.039928,-0.005582,0.006454,0.058594,0.107138,...,-0.016228,9.2e-05,0.00812,0.106708,0.119428,0.096792,0.008434,0.028232,-0.061506,-0.016837
15-delta prostaglandin J2_PC3_6.0h_10.0μM,0.224435,0.347837,1.0,0.011506,-0.000319,0.009046,0.066527,-0.00818,0.073321,0.037073,...,-0.034181,-0.033036,-0.017461,-0.033827,-0.025956,-0.047849,0.127636,0.010882,-0.074845,-0.104564
3-hydroxy-DL-kynurenine_MCF7_6.0h_17.8μM,-0.03306,0.020978,0.011506,1.0,0.0,0.014638,0.000608,-0.009136,-0.003298,0.005422,...,0.02106,0.003184,-0.004028,-0.026977,0.024244,-0.001617,-0.021154,0.014875,0.003641,-0.015566
6-benzylaminopurine_MCF7_6.0h_17.8μM,-0.013305,-0.005801,-0.000319,0.0,1.0,-0.016854,0.015304,0.024833,0.036977,0.000326,...,-0.015913,-0.002983,-0.052491,0.005038,-0.016179,-0.016147,-0.025987,0.05572,0.029974,0.017889


# Save Attribute Similarity Matrix

In [45]:
# Write the attribute similarity matrix as a tab-separated file whose name
# carries the current year and month (YYYY_MM).
# NOTE(review): gzip payload under a '.zip' name.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = 'Output/cmap_attribute_similarity_matix_%s.tsv.zip' % date_tag
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [46]:
# Table of the genes in the normalised matrix. The preview below shows one row
# per gene with a GeneSym and a GeneID column; how the IDs are looked up is
# defined in untility_functions.
gene_list = uf.createGeneList(normalized_matrix)

Progeres: 100%  11801 Out of 11801   

In [47]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,A1CF,29974
1,A2M,2
2,A4GALT,53947
3,A4GNT,51146
4,AAAS,8086


In [48]:
gene_list.shape

(11801, 2)

# Save Gene List

In [49]:
# Write the gene list (without its positional index) as a tab-separated file
# whose name carries the current year and month (YYYY_MM).
# NOTE(review): gzip payload under a '.zip' name.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = 'Output/cmap_gene_list_%s.tsv.zip' % date_tag
gene_list.to_csv(filename, sep='\t', compression='gzip', index=False)

# Create Attribute List

In [50]:
# Table of the attributes (signature labels) of the normalised matrix. The
# preview below shows a single 'Attributes' column with one row per signature.
attribute_list = uf.createAttributeList(normalized_matrix)

In [51]:
attribute_list.head()

Unnamed: 0,Attributes
0,15-delta prostaglandin J2_HL60_6.0h_10.0μM
1,15-delta prostaglandin J2_MCF7_6.0h_10.0μM
2,15-delta prostaglandin J2_PC3_6.0h_10.0μM
3,3-hydroxy-DL-kynurenine_MCF7_6.0h_17.8μM
4,6-benzylaminopurine_MCF7_6.0h_17.8μM


In [52]:
attribute_list.shape

(200, 1)

# Save Attribute List

In [53]:
# Write the attribute list (without its positional index) as a tab-separated
# file whose name carries the current year and month (YYYY_MM).
# NOTE(review): gzip payload under a '.zip' name.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = 'Output/cmap_attribute_list_%s.tsv.zip' % date_tag
attribute_list.to_csv(filename, sep='\t', compression='gzip', index=False)

# Create Gene-Attribute Edge List

In [54]:
# Output directory handed to uf.createGeneAttributeEdgeList below. It is
# derived from the notebook's working directory instead of a user-specific
# absolute path, so the notebook also runs on other machines. The result is
# still absolute and still ends with a path separator, like the previous
# literal.
path = os.path.join(os.getcwd(), 'Output', '')

In [55]:
# Name passed to uf.createGeneAttributeEdgeList in the next cell.
name = 'cmap_gene_attribute_edge_list'

In [56]:
# Build the gene-attribute edge list from the tertiary matrix and the gene
# list. `path` and `name` are handed to the helper, presumably as output
# directory and base file name — confirm in untility_functions. The helper
# prints the number of gene-attribute associations it found (see output below).
uf.createGeneAttributeEdgeList(tertiary_matrix, gene_list, path, name)

Progeres: 100%  200 Out of 200   

 The number of statisticaly relevent gene-attribute associations is: 142745
