# JENSEN LAB (COMPARTMENTS)

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Source: http://compartments.jensenlab.org/Search

In [1]:
import datetime
import importlib
import os
import sys

import numpy as np
import pandas as pd

import untility_functions as uf
%matplotlib inline

In [2]:
# Re-import the already-loaded helper module so that edits made to
# untility_functions.py take effect without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/Jensen/Compartments/untility_functions.py'>

# LOAD DATA

In [3]:
# Column labels handed to read_csv (names=col) for each of the three
# COMPARTMENTS input files; order matters because it is positional.
col = [
    'Ensemble Acc',
    'GeneSym',
    'BTO',
    'Compartment',
    'Source',
    'SampleInfo',
    'Value',
]

In [4]:
# Experiments file: tab-separated, labelled with the names in col.
dfE = pd.read_csv(
    'Input/human_compartment_experiments_full.tsv',
    sep='\t',
    names=col,
)

In [5]:
# Knowledge file: tab-separated, labelled with the names in col.
dfK = pd.read_csv(
    'Input/human_compartment_knowledge_full.tsv',
    sep='\t',
    names=col,
)

In [6]:
# Text-mining file: tab-separated, labelled with the names in col.
# NOTE(review): the same seven labels are applied to all three files; confirm
# the text-mining file really shares the column layout of the other two.
dfT = pd.read_csv(
    'Input/human_compartment_textmining_full.tsv',
    sep='\t',
    names=col,
)

In [7]:
# Stack the three evidence channels into one frame.
# ignore_index=True renumbers the rows 0..n-1; without it each source keeps its
# own 0-based index and the combined frame carries duplicate row labels.
df = pd.concat([dfE, dfK, dfT], ignore_index=True)

In [8]:
df.head()

Unnamed: 0,Ensemble Acc,GeneSym,BTO,Compartment,Source,SampleInfo,Value
0,ENSP00000000442,ESRRA,GO:0005575,cellular_component,HPA,Weak: 1 antibody,1
1,ENSP00000000442,ESRRA,GO:0005622,Intracellular,HPA,Weak: 1 antibody,1
2,ENSP00000000442,ESRRA,GO:0005623,Cell,HPA,Weak: 1 antibody,1
3,ENSP00000000442,ESRRA,GO:0005634,Nucleus,HPA,Weak: 1 antibody,1
4,ENSP00000000442,ESRRA,GO:0005730,Nucleolus,HPA,Weak: 1 antibody,1


In [9]:
df.shape

(1375086, 7)

# Get Only Gene-Compartment Data

In [10]:
# Keep only the rows whose Value entry is not equal to 0.
df = df.loc[df['Value'].ne(0)]

In [11]:
# Reduce the frame to the gene symbol and compartment name columns.
df = df.loc[:, ['GeneSym', 'Compartment']]

In [12]:
# Collapse repeated (GeneSym, Compartment) pairs.
# Rebinding instead of inplace=True avoids mutating a frame that was derived by
# indexing another frame (the pattern behind SettingWithCopyWarning).
df = df.drop_duplicates()

In [13]:
# Move GeneSym out of the columns and into the row index.
df = df.set_index('GeneSym')

In [14]:
df.head()

Unnamed: 0_level_0,Compartment
GeneSym,Unnamed: 1_level_1
ESRRA,cellular_component
ESRRA,Intracellular
ESRRA,Cell
ESRRA,Nucleus
ESRRA,Nucleolus


In [15]:
df.shape

(847583, 1)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [16]:
# Project helper that remaps the gene symbols in df (see the section header).
# The return value is not captured: df is modified in place, and in the recorded
# run its row count goes from 847583 before this cell to 831825 after it.
uf.mapgenesymbols(df)

Progeres: 100%  847583 Out of 847583   

In [17]:
df.shape

(831825, 1)

# Create Binary Matrix

In [18]:
# Turn the GeneSym index back into an ordinary column before building the matrix.
df = df.reset_index()

In [19]:
# Build the gene-by-compartment matrix from the (GeneSym, Compartment) pairs in df.
# In the recorded run the result has genes as rows and compartments as columns
# (18535 x 2826), filled with 0/1 values in the preview shown below.
binary_matrix = uf.createBinaryMatix(df)

Progeres: 100%  18535 Out of 18535   

In [20]:
binary_matrix.head()

Unnamed: 0,Presynaptic active zone cytoplasmic component,Cytoplasmic exosome (RNase complex),ERBB4-ERBB3 complex,Fc receptor complex,CENP-A recruiting complex,Signal recognition particle,Integrin alphav-beta3 complex,Mitochondrial part,Pexophagosome,ferredoxin:thioredoxin reductase complex,...,Mucocyst,Peroxisomal membrane,Axoneme,Transport vesicle,Intramolecular phosphotransferase complex,interleukin-3 receptor complex,Membrane attack complex,GO:0033644,Proteasome core complex,"Synaptic vesicle, recycling pool"
LRCH3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
KLHDC8A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SLC5A8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
MAATS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ETV4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
binary_matrix.shape

(18535, 2826)

# Save Binary Matrix

In [22]:
# Output name carries a YYYY_MM stamp taken from today's date.
# The payload is gzip-compressed, so the file is named .tsv.gz; the previous
# .tsv.zip name mislabelled the format and breaks extension-based inference
# when the file is read back.
filename = '~/Documents/Harmonizome/Jensen/Compartments/Output/jensen_compartments_binary_matrix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [23]:
# Output directory for the gene set library, resolved under the current user's
# home directory (same location the save cells reach through '~/') instead of
# being tied to one machine's absolute /Users/... path.
path = os.path.expanduser('~/Documents/Harmonizome/Jensen/Compartments/Output/')

In [24]:
# Base name passed, together with path, to uf.createUpGeneSetLib in the next cell.
name = 'jensen_compartments_gene_set'

In [25]:
# Project helper called for its side effect (nothing is returned to the notebook).
# Its progress output counts 2826 items, matching the number of compartment columns.
# NOTE(review): presumably writes the library file under `path` using `name` as
# the file stem - confirm in untility_functions.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  2826 Out of 2826   

# Create Attribute Library

In [26]:
# Output directory for the attribute library, resolved under the current user's
# home directory (same location the save cells reach through '~/') instead of
# being tied to one machine's absolute /Users/... path.
path = os.path.expanduser('~/Documents/Harmonizome/Jensen/Compartments/Output/')

In [27]:
# Base name passed, together with path, to uf.createUpAttributeSetLib in the next cell.
name = 'jensen_compartments_attribute_set'

In [28]:
# Project helper called for its side effect (nothing is returned to the notebook).
# Its progress output counts 18535 items, matching the number of gene rows.
# NOTE(review): presumably writes the library file under `path` using `name` as
# the file stem - confirm in untility_functions.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  18535 Out of 18535   

# Create Gene Similarity Matrix

In [29]:
# Pairwise similarity between the rows (genes) of binary_matrix using the
# 'jaccard' option of the project helper; the preview below is labelled by gene
# on both axes with 1.0 on the diagonal.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [30]:
gene_similarity_matix.head()

Unnamed: 0,LRCH3,KLHDC8A,SLC5A8,MAATS1,ETV4,ZMYM2,ITGB1BP2,GRAMD1B,PSMG1,SYNE2,...,TRIM67,MYLK4,TM9SF2,RPTN,ADCY2,NFAT5,ABHD14B,GJB3,ETV3L,B3GNT2
LRCH3,1.0,0.3125,0.103448,0.5,0.129032,0.089888,0.275862,0.147059,0.233333,0.065574,...,0.179487,0.095238,0.186047,0.173913,0.166667,0.163265,0.3,0.122807,0.277778,0.135593
KLHDC8A,0.3125,1.0,0.135593,0.263158,0.106061,0.075269,0.142857,0.135135,0.142857,0.055556,...,0.166667,0.04,0.102041,0.111111,0.125,0.176471,0.135135,0.116667,0.238095,0.09375
SLC5A8,0.103448,0.135593,1.0,0.155172,0.26087,0.222222,0.136986,0.328125,0.276923,0.257143,...,0.194805,0.169492,0.385714,0.089552,0.189189,0.355263,0.393443,0.264368,0.169492,0.269663
MAATS1,0.5,0.263158,0.155172,1.0,0.177419,0.136364,0.290323,0.272727,0.37931,0.099174,...,0.289474,0.130435,0.255814,0.111111,0.153846,0.22449,0.354839,0.218182,0.529412,0.186441
ETV4,0.129032,0.106061,0.26087,0.177419,1.0,0.241667,0.171053,0.378788,0.289855,0.246575,...,0.272727,0.071429,0.197674,0.144928,0.13253,0.415584,0.358209,0.274725,0.171875,0.214286


## Save Gene Similarity Matrix 

In [31]:
# Output name carries a YYYY_MM stamp taken from today's date.
# The payload is gzip-compressed, so the file is named .tsv.gz; the previous
# .tsv.zip name mislabelled the format and breaks extension-based inference
# when the file is read back.
filename = '~/Documents/Harmonizome/Jensen/Compartments/Output/jensen_compartments_gene_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [32]:
# Same helper as for genes, applied to the transpose so that the compared rows
# are compartments; the preview below is labelled by compartment on both axes.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [33]:
attribute_similarity_matix.head()

Unnamed: 0,Presynaptic active zone cytoplasmic component,Cytoplasmic exosome (RNase complex),ERBB4-ERBB3 complex,Fc receptor complex,CENP-A recruiting complex,Signal recognition particle,Integrin alphav-beta3 complex,Mitochondrial part,Pexophagosome,ferredoxin:thioredoxin reductase complex,...,Mucocyst,Peroxisomal membrane,Axoneme,Transport vesicle,Intramolecular phosphotransferase complex,interleukin-3 receptor complex,Membrane attack complex,GO:0033644,Proteasome core complex,"Synaptic vesicle, recycling pool"
Presynaptic active zone cytoplasmic component,1.0,0.005348,0.005747,0.005698,0.0,0.011142,0.0,0.015401,0.0,0.0,...,0.0,0.0,0.008711,0.046171,0.0,0.0,0.0,0.0,0.0,0.005814
Cytoplasmic exosome (RNase complex),0.005348,1.0,0.0,0.0,0.0,0.004673,0.0,0.002273,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ERBB4-ERBB3 complex,0.005747,0.0,1.0,0.005236,0.0,0.0,0.0,0.002864,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fc receptor complex,0.005698,0.0,0.005236,1.0,0.0,0.021505,0.031847,0.016869,0.0,0.0,...,0.0,0.0,0.001681,0.015021,0.004065,0.064516,0.094395,0.0,0.004988,0.0
CENP-A recruiting complex,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.000567,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.004854,0.0,0.0,0.0


## Save Attribute Similarity Matrix

In [34]:
# Output name carries a YYYY_MM stamp taken from today's date.
# The payload is gzip-compressed, so the file is named .tsv.gz; the previous
# .tsv.zip name mislabelled the format and breaks extension-based inference
# when the file is read back.
filename = '~/Documents/Harmonizome/Jensen/Compartments/Output/jensen_compartments_attribute_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [35]:
# Derive the gene table from binary_matrix via the project helper. In the
# recorded run it has one row per gene (18535) with columns GeneSym and GeneID.
# NOTE(review): where the GeneID values are looked up is not visible here.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  18535 Out of 18535   

In [36]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,LRCH3,84859
1,KLHDC8A,55220
2,SLC5A8,160728
3,MAATS1,89876
4,ETV4,2118


In [37]:
gene_list.shape

(18535, 2)

### Save Gene List

In [38]:
# Output name carries a YYYY_MM stamp taken from today's date.
# The payload is gzip-compressed, so the file is named .tsv.gz; the previous
# .tsv.zip name mislabelled the format and breaks extension-based inference
# when the file is read back. index=False keeps the row numbers out of the file.
filename = '~/Documents/Harmonizome/Jensen/Compartments/Output/jensen_compartments_gene_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [39]:
# Derive the attribute table from binary_matrix via the project helper. In the
# recorded run it has a single column, Attributes, with one row per compartment (2826).
attribute_list = uf.createAttributeList(binary_matrix)

In [40]:
attribute_list.head()

Unnamed: 0,Attributes
0,Presynaptic active zone cytoplasmic component
1,Cytoplasmic exosome (RNase complex)
2,ERBB4-ERBB3 complex
3,Fc receptor complex
4,CENP-A recruiting complex


In [41]:
attribute_list.shape

(2826, 1)

### Save Attribute List

In [42]:
# Output name carries a YYYY_MM stamp taken from today's date.
# The payload is gzip-compressed, so the file is named .tsv.gz; the previous
# .tsv.zip name mislabelled the format and breaks extension-based inference
# when the file is read back. index=False keeps the row numbers out of the file.
filename = '~/Documents/Harmonizome/Jensen/Compartments/Output/jensen_compartments_attribute_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [43]:
# Output directory for the edge list, resolved under the current user's home
# directory (same location the save cells reach through '~/') instead of being
# tied to one machine's absolute /Users/... path.
path = os.path.expanduser('~/Documents/Harmonizome/Jensen/Compartments/Output/')

In [44]:
# Base name passed, together with path, to uf.createGeneAttributeEdgeList in the next cell.
name = 'jensen_compartments_gene_attribute_edge_list'

In [45]:
# Project helper called for its side effect; it takes the matrix plus gene_list
# and reports the number of gene-attribute associations (829693 in the recorded run).
# NOTE(review): presumably writes the edge list under `path` using `name` as the
# file stem - confirm in untility_functions.
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  2826 Out of 2826   

 The number of statisticaly relevent gene-attribute associations is: 829693
