# JENSEN LAB (COMPARTMENTS)

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Source: http://compartments.jensenlab.org/Search

In [1]:
import datetime
import importlib
import os
import sys

import numpy as np
import pandas as pd

import my_functions as mf
%matplotlib inline

In [80]:
# Development aid: re-import my_functions so edits made to the module take
# effect without restarting the kernel.
importlib.reload(mf)

<module 'my_functions' from '/Users/moshesilverstein/Documents/Harmonizome/Jensen/Compartments/my_functions.py'>

# LOAD DATA

In [11]:
# Column labels applied to each COMPARTMENTS TSV when it is read with
# pd.read_csv(..., names=col).
# NOTE(review): the third label is 'BTO', yet the previewed rows hold GO
# identifiers (e.g. GO:0005575) in that column; the label is kept unchanged.
col = [
    'Ensemble Acc',
    'GeneSym',
    'BTO',
    'Compartment',
    'Source',
    'SampleInfo',
    'Value',
]

In [12]:
# Experiments table: tab-separated; the first line is read as data because
# the column names are supplied through `col`.
dfE = pd.read_csv(
    'Input/human_compartment_experiments_full.tsv',
    names=col,
    sep='\t',
)

In [13]:
# Knowledge table: tab-separated; the first line is read as data because
# the column names are supplied through `col`.
dfK = pd.read_csv(
    'Input/human_compartment_knowledge_full.tsv',
    names=col,
    sep='\t',
)

In [14]:
# Text-mining table: tab-separated; the first line is read as data because
# the column names are supplied through `col`.
# NOTE(review): the same labels are reused for all three files; confirm that
# the text-mining file really shares the Source/SampleInfo/Value layout.
dfT = pd.read_csv(
    'Input/human_compartment_textmining_full.tsv',
    names=col,
    sep='\t',
)

In [15]:
# Stack the experiments, knowledge and text-mining tables row-wise into one
# frame (each table keeps its own row labels).
df = pd.concat(objs=[dfE, dfK, dfT], axis=0)

In [16]:
# Preview the first rows of the combined table.
df.head()

Unnamed: 0,Ensemble Acc,GeneSym,BTO,Compartment,Source,SampleInfo,Value
0,ENSP00000000442,ESRRA,GO:0005575,cellular_component,HPA,Weak: 1 antibody,1
1,ENSP00000000442,ESRRA,GO:0005622,Intracellular,HPA,Weak: 1 antibody,1
2,ENSP00000000442,ESRRA,GO:0005623,Cell,HPA,Weak: 1 antibody,1
3,ENSP00000000442,ESRRA,GO:0005634,Nucleus,HPA,Weak: 1 antibody,1
4,ENSP00000000442,ESRRA,GO:0005730,Nucleolus,HPA,Weak: 1 antibody,1


In [17]:
# (rows, columns) of the combined table before any filtering.
df.shape

(1375086, 7)

# Get Only Gene-Compartment Data

In [18]:
# Keep only the gene symbol and compartment name columns.
df = df.loc[:, ['GeneSym', 'Compartment']]

In [19]:
# Collapse repeated rows so each remaining row is distinct.
df = df.drop_duplicates()

In [20]:
# Move the gene symbol from a column into the row index.
df = df.set_index('GeneSym')

In [21]:
# Preview the table now that it is indexed by gene symbol.
df.head()

Unnamed: 0_level_0,Compartment
GeneSym,Unnamed: 1_level_1
ESRRA,cellular_component
ESRRA,Intracellular
ESRRA,Cell
ESRRA,Nucleus
ESRRA,Nucleolus


In [22]:
# (rows, columns) after column selection and de-duplication.
df.shape

(847583, 1)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [23]:
# Map gene symbols to approved symbols using the project helper.
# NOTE(review): the return value is not captured and df.shape differs in the
# next cell, so the helper appears to modify `df` in place — confirm in
# my_functions.
mf.mapgenesymbols(df)

Progeres: 100%  847583 Out of 847583   

In [24]:
# (rows, columns) after symbol mapping; compare with the shape shown before
# the mapping step.
df.shape

(831825, 1)

# Create Binary Matrix

In [25]:
# Turn the gene-symbol index back into an ordinary column before building
# the matrix.
df = df.reset_index()

In [26]:
# Build the gene-by-compartment matrix with the project helper. The preview
# and shape below show genes as rows, compartment names as columns and 0/1
# entries.
binary_matrix = mf.createBinaryMatix(df)

Progeres: 100%  18535 Out of 18535   

In [27]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,Chloroplast stroma,calcium- and calmodulin-dependent protein kinase complex,XY body,Acidocalcisome membrane,Anchored component of membrane,Diamine N-acetyltransferase complex,Membrane part,Scrib-APC-beta-catenin complex,Nuclear membrane,AIP1-IRE1 complex,...,Endosome,cell-cell contact zone,Nitrate reductase complex,protein-DNA-RNA complex,Sorting endosome,Longitudinal sarcoplasmic reticulum,XRCC2-RAD51D complex,MutLbeta complex,Serine C-palmitoyltransferase complex,DNA-directed RNA polymerase V complex
WDR91,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
ALOX5,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
SLC36A1,0,0,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
ZNF275,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BDNF,0,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [28]:
# (number of genes, number of compartments).
binary_matrix.shape

(18535, 2826)

# Save Binary Matrix

In [29]:
# Write the binary matrix as a gzip-compressed TSV tagged with the current
# year and month (YYYY_MM). The name ends in .gz so the extension matches the
# gzip stream that compression='gzip' produces.
filename = '~/./Documents/Harmonizome/Jensen/Compartments/Output/jensen_compartments_binary_matrix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [30]:
# Output directory for the gene set library. It is resolved from the current
# user's home directory rather than a hard-coded /Users/<name> prefix, so the
# notebook is not tied to one machine; '~' is expanded here because the value
# is handed to a my_functions helper as a plain string.
path = os.path.expanduser('~/Documents/Harmonizome/Jensen/Compartments/Output/')

In [31]:
# Name passed to mf.createUpGeneSetLib in the next cell.
name = 'jensen_compartments_gene_set'

In [32]:
# Build the gene set library from the binary matrix with the project helper.
# The progress output counts 2826 items, the number of matrix columns.
# NOTE(review): `path` and `name` presumably select the output directory and
# file stem — confirm in my_functions.
mf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  2826 Out of 2826   

# Create Attribute Library

In [33]:
# Output directory for the attribute set library. It is resolved from the
# current user's home directory rather than a hard-coded /Users/<name>
# prefix, so the notebook is not tied to one machine; '~' is expanded here
# because the value is handed to a my_functions helper as a plain string.
path = os.path.expanduser('~/Documents/Harmonizome/Jensen/Compartments/Output/')

In [34]:
# Name passed to mf.createUpAttributeSetLib in the next cell.
name = 'jensen_compartments_attribute_set'

In [35]:
# Build the attribute set library from the binary matrix with the project
# helper. The progress output counts 18535 items, the number of matrix rows.
# NOTE(review): `path` and `name` presumably select the output directory and
# file stem — confirm in my_functions.
mf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  18535 Out of 18535   

# Create Gene Similarity Matrix

In [36]:
# Similarity between genes (the rows of binary_matrix), requested with the
# 'jaccard' option of the project helper. The preview below is gene-by-gene
# with 1.0 on the diagonal.
# The 'matix' spelling is kept because later cells use this variable name.
gene_similarity_matix = mf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [37]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,WDR91,ALOX5,SLC36A1,ZNF275,BDNF,PRDM16,UPF3A,FMO2,JAK3,NFIB,...,LNPEP,ZNF510,OR7A5,PTPN21,SLC35G2,HNRNPR,LSS,KIF21B,PRR5,USF2
WDR91,1.0,0.121951,0.222222,0.4,0.048387,0.188679,0.188679,0.290323,0.126437,0.238095,...,0.141176,0.555556,0.136364,0.21875,0.310345,0.2,0.153846,0.155556,0.15625,0.276596
ALOX5,0.121951,1.0,0.207547,0.24359,0.219697,0.302083,0.28866,0.243902,0.355932,0.225806,...,0.261905,0.168831,0.115385,0.134831,0.265823,0.22,0.317308,0.14,0.11236,0.326087
SLC36A1,0.222222,0.207547,1.0,0.183333,0.129771,0.164706,0.192771,0.357143,0.207207,0.157895,...,0.33,0.207547,0.173077,0.153846,0.345455,0.21519,0.206522,0.173333,0.123077,0.28
ZNF275,0.4,0.24359,0.183333,1.0,0.077236,0.387755,0.387755,0.323529,0.157303,0.540541,...,0.159091,0.434783,0.111111,0.257143,0.264706,0.354167,0.212121,0.23913,0.135135,0.413043
BDNF,0.048387,0.219697,0.129771,0.077236,1.0,0.131274,0.122605,0.093117,0.210332,0.114625,...,0.202206,0.053061,0.036585,0.063241,0.093878,0.150794,0.13806,0.137097,0.059524,0.146245


## Save Gene Similarity Matrix 

In [38]:
# Write the gene similarity matrix as a gzip-compressed TSV tagged with the
# current year and month (YYYY_MM). The name ends in .gz so the extension
# matches the gzip stream that compression='gzip' produces.
filename = '~/./Documents/Harmonizome/Jensen/Compartments/Output/jensen_compartments_gene_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [39]:
# Same helper and 'jaccard' option as for genes, but on the transposed matrix
# so that compartments become the rows being compared. The preview below is
# compartment-by-compartment with 1.0 on the diagonal.
# The 'matix' spelling is kept because later cells use this variable name.
attribute_similarity_matix = mf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [40]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,Chloroplast stroma,calcium- and calmodulin-dependent protein kinase complex,XY body,Acidocalcisome membrane,Anchored component of membrane,Diamine N-acetyltransferase complex,Membrane part,Scrib-APC-beta-catenin complex,Nuclear membrane,AIP1-IRE1 complex,...,Endosome,cell-cell contact zone,Nitrate reductase complex,protein-DNA-RNA complex,Sorting endosome,Longitudinal sarcoplasmic reticulum,XRCC2-RAD51D complex,MutLbeta complex,Serine C-palmitoyltransferase complex,DNA-directed RNA polymerase V complex
Chloroplast stroma,1.0,0.007168,0.0,0.0,0.003165,0.006711,0.007322,0.0,0.008111,0.0,...,0.004289,0.0,0.034483,0.0,0.0,0.014085,0.0,0.0,0.006494,0.0
calcium- and calmodulin-dependent protein kinase complex,0.007168,1.0,0.007782,0.006329,0.002865,0.0,0.014445,0.006329,0.020339,0.0,...,0.017032,0.032558,0.0,0.0,0.007273,0.023121,0.004878,0.0,0.0,0.0
XY body,0.0,0.007782,1.0,0.0,0.0,0.007874,0.003658,0.0,0.011933,0.0,...,0.000619,0.0,0.0,0.0,0.0,0.0,0.0,0.008929,0.0,0.0
Acidocalcisome membrane,0.0,0.006329,0.0,1.0,0.0,0.0,0.000246,0.0,0.001339,0.0,...,0.00066,0.001961,0.0,0.0,0.0,0.047619,0.0,0.0,0.0,0.0
Anchored component of membrane,0.003165,0.002865,0.0,0.0,1.0,0.004587,0.023236,0.0,0.009677,0.0,...,0.010651,0.021834,0.003597,0.0,0.009677,0.0,0.0,0.0,0.0,0.0


## Save Attribute Similarity Matrix

In [41]:
# Write the attribute similarity matrix as a gzip-compressed TSV tagged with
# the current year and month (YYYY_MM). The name ends in .gz so the extension
# matches the gzip stream that compression='gzip' produces.
filename = '~/./Documents/Harmonizome/Jensen/Compartments/Output/jensen_compartments_attribute_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [42]:
# Build the gene list from the binary matrix with the project helper. The
# preview below shows one row per gene with GeneSym and GeneID columns.
gene_list = mf.createGeneList(binary_matrix)

Progeres: 100%  18535 Out of 18535   

In [43]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,WDR91,29062
1,ALOX5,240
2,SLC36A1,206358
3,ZNF275,10838
4,BDNF,627


In [44]:
# (number of genes, number of columns) in the gene list.
gene_list.shape

(18535, 2)

### Save Gene List

In [45]:
# Write the gene list (without the row index) as a gzip-compressed TSV tagged
# with the current year and month (YYYY_MM). The name ends in .gz so the
# extension matches the gzip stream that compression='gzip' produces.
filename = '~/./Documents/Harmonizome/Jensen/Compartments/Output/jensen_compartments_gene_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [46]:
# Build the attribute list from the binary matrix with the project helper.
# The preview below shows a single 'Attributes' column of compartment names.
attribute_list = mf.createAttributeList(binary_matrix)

In [47]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0,Attributes
0,Chloroplast stroma
1,calcium- and calmodulin-dependent protein kina...
2,XY body
3,Acidocalcisome membrane
4,Anchored component of membrane


In [48]:
# (number of attributes, number of columns) in the attribute list.
attribute_list.shape

(2826, 1)

### Save Attribute List

In [49]:
# Write the attribute list (without the row index) as a gzip-compressed TSV
# tagged with the current year and month (YYYY_MM). The name ends in .gz so
# the extension matches the gzip stream that compression='gzip' produces.
filename = '~/./Documents/Harmonizome/Jensen/Compartments/Output/jensen_compartments_attribute_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [52]:
# Output directory for the gene-attribute edge list. It is resolved from the
# current user's home directory rather than a hard-coded /Users/<name>
# prefix, so the notebook is not tied to one machine; '~' is expanded here
# because the value is handed to a my_functions helper as a plain string.
path = os.path.expanduser('~/Documents/Harmonizome/Jensen/Compartments/Output/')

In [53]:
# Name passed to mf.createGeneAttributeEdgeList in the next cell.
name = 'jensen_compartments_gene_attribute_edge_list'

In [81]:
# Build the gene-attribute edge list from the binary matrix and the gene list
# with the project helper. The progress output counts 2826 items, the number
# of matrix columns, and the helper prints the number of associations found.
# NOTE(review): `path` and `name` presumably select the output directory and
# file stem — confirm in my_functions.
mf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  2826 Out of 2826   

 The number of statisticaly relevent gene-attribute associations is: 829693
