# Reactome

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: http://reactome.org/pages/download-data/

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
# Re-import the helper module so that edits made to it on disk are picked up
# without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/Reactome/untility_functions.py'>

# Load Data

In [18]:
# Read every line of the GMT file into a single string column (column 0).
# '%%' is used as a separator that is not expected to occur in the file, so
# rows are left unsplit here; the tab-splitting is done in a later cell.
# Multi-character separators are only handled by the python parser, so it is
# requested explicitly instead of relying on the automatic fallback, which
# emits a ParserWarning.
df = pd.read_csv('ReactomePathways.gmt',
                 sep='%%',
                 header=None,
                 engine='python')

  app.launch_new_instance()


In [19]:
# (rows, columns) of the raw frame; a single column means no line was split.
df.shape

(1892, 1)

In [20]:
# Peek at the first rows: each one still holds a whole tab-delimited line.
df.head()

Unnamed: 0,0
0,NS1 Mediated Effects on Host Pathways\tR-HSA-1...
1,2-LTR circle formation\tR-HSA-164843\tReactome...
2,3' -UTR-mediated translational regulation\tR-H...
3,5-Phosphoribose 1-diphosphate biosynthesis\tR-...
4,A tetrasaccharide linker sequence is required ...


In [23]:
# Expand every raw line into one (pathway, gene) row per gene.
# After splitting a line on tabs, field 0 is the pathway name, fields 1-2 are
# skipped, and everything from field 3 onwards is treated as a gene symbol.
# The rows are collected in plain lists and the frame is built once at the
# end; growing a DataFrame with pd.concat inside the loop is quadratic.
total_lines = len(df.index)
pathway_column = []
gene_column = []

for i, index in enumerate(df.index):

    progressPercent = ((i+1)/total_lines)*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), total_lines))
    sys.stdout.flush()

    # .loc is the label-based replacement for the removed .ix accessor.
    fields = df.loc[index, 0].split('\t')
    genes = fields[3:]
    pathway_column.extend([fields[0]]*len(genes))
    gene_column.extend(genes)

# Column order matches what the following cells expect: Pathways, then Genes.
df_pathways = pd.DataFrame({'Pathways': pathway_column, 'Genes': gene_column},
                           columns=['Pathways', 'Genes'])

Progress: 100%  1892 Out of 1892   

In [24]:
# Preview the long-format table: one row per (pathway, gene) pair.
df_pathways.head()

Unnamed: 0,Pathways,Genes
0,NS1 Mediated Effects on Host Pathways,AAAS
1,NS1 Mediated Effects on Host Pathways,CPSF4
2,NS1 Mediated Effects on Host Pathways,EIF2AK2
3,NS1 Mediated Effects on Host Pathways,ISG15
4,NS1 Mediated Effects on Host Pathways,KPNA1


In [25]:
# Number of (pathway, gene) rows before symbol mapping and de-duplication.
df_pathways.shape

(106705, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [26]:
# Move the gene symbols into the index, leaving 'Pathways' as the only column.
df_pathways = df_pathways.set_index('Genes')

In [27]:
# Project helper; the return value is not used, and the shape reported by the
# following cell differs from the one before this call, so it appears to
# modify df_pathways in place -- TODO confirm in untility_functions.
uf.mapgenesymbols(df_pathways)

Progeres: 99%  105891 Out of 106705   

In [29]:
# Compare with the shape before the mapping step to see whether rows were removed.
df_pathways.shape

(105632, 1)

# Drop Duplicates

In [30]:
# Turn the 'Genes' index back into a regular column so it can be used as a
# de-duplication key.
df_pathways = df_pathways.reset_index()

In [31]:
# Keep a single row for each distinct (gene, pathway) combination.
df_pathways = df_pathways.drop_duplicates(subset=['Genes', 'Pathways'])

In [32]:
# Shape after de-duplication.
df_pathways.shape

(105556, 2)

In [33]:
# Preview the cleaned gene/pathway table that feeds the binary matrix.
df_pathways.head()

Unnamed: 0,Genes,Pathways
0,AAAS,NS1 Mediated Effects on Host Pathways
1,CPSF4,NS1 Mediated Effects on Host Pathways
2,EIF2AK2,NS1 Mediated Effects on Host Pathways
3,ISG15,NS1 Mediated Effects on Host Pathways
4,KPNA1,NS1 Mediated Effects on Host Pathways


# Create Binary Matrix

In [34]:
# Project helper that builds the matrix from the gene/pathway pairs.
# Judging by the preview that follows, rows are genes, columns are pathways
# and values are 0/1 -- confirm against untility_functions.
binary_matrix = uf.createBinaryMatix(df_pathways)

Progeres: 100%  10237 Out of 10237   

In [35]:
# Preview the first rows of the matrix.
binary_matrix.head()

Unnamed: 0,RNA polymerase II transcribes snRNA genes,Common Pathway of Fibrin Clot Formation,Extracellular matrix organization,Ion channel transport,Intra-Golgi and retrograde Golgi-to-ER traffic,Heme biosynthesis,Metabolism of polyamines,Early Phase of HIV Life Cycle,Synthesis of IP3 and IP4 in the cytosol,SMAC binds to IAPs,...,PI and PC transport between ER and Golgi membranes,Displacement of DNA glycosylase by APEX1,Negative regulation of FGFR1 signaling,Depyrimidination,Cross-presentation of soluble exogenous antigens (endosomes),Hh mutants abrogate ligand secretion,Oxidative Stress Induced Senescence,Defective GALNT3 causes familial hyperphosphatemic tumoral calcinosis (HFTC),Asparagine N-linked glycosylation,IRS-mediated signalling
NOD1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SCN11A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CENPS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CD320,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ART3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# (number of rows, number of columns) of the binary matrix.
binary_matrix.shape

(10237, 1887)

# Save Binary Matrix

In [38]:
# The output name carries a YYYY_MM stamp of the run date.
# The data is written gzip-compressed, so the suffix is '.gz' (not '.zip') to
# match the actual format on disk.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/Documents/Harmonizome/Reactome/Output/reactome_binary_matrix_%s.tsv.gz' % date_stamp
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [39]:
# Output directory, resolved from the current user's home directory instead of
# a hardcoded /Users/<name> prefix so the notebook also runs on other machines.
# The trailing slash is kept because the value is passed on as a path prefix.
path = os.path.expanduser('~/Documents/Harmonizome/Reactome/Output/')

In [40]:
# Base name passed to the gene set library writer in the next cell.
name = 'reactome_gene_set'

In [41]:
# Project helper; presumably writes the gene set library into `path` using
# `name` as the file stem -- the exact file name and format are defined in
# untility_functions, verify there.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  1887 Out of 1887   

# Create Attribute Library

In [42]:
# Output directory, resolved from the current user's home directory instead of
# a hardcoded /Users/<name> prefix so the notebook also runs on other machines.
# The trailing slash is kept because the value is passed on as a path prefix.
path = os.path.expanduser('~/Documents/Harmonizome/Reactome/Output/')

In [43]:
# Base name passed to the attribute set library writer in the next cell.
name = 'reactome_attribute_set'

In [44]:
# Project helper; presumably writes the attribute set library into `path`
# using `name` as the file stem -- the exact file name and format are defined
# in untility_functions, verify there.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  10237 Out of 10237   

# Create Gene Similarity Matrix

In [45]:
# Similarity matrix computed from binary_matrix with the helper's 'jaccard'
# option. The preview that follows is indexed by gene on both axes, which
# suggests rows are compared pairwise -- implementation lives in untility_functions.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [46]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,NOD1,SCN11A,CENPS,CD320,ART3,CLEC4E,ELOA3D,PLA2G4F,COA4,KCNF1,...,TBC1D10C,PPP6R1,HTR3D,LEP,TADA2B,TBL1X,IDI2,YY1,GDA,GCHFR
NOD1,1.0,0.0,0.0,0.0,0.058824,0.057143,0.0,0.0,0.029412,0.0,...,0.054054,0.051282,0.0,0.02439,0.081081,0.0,0.0,0.066667,0.0,0.0
SCN11A,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.066667,0.0,0.026316,0.0,0.047619,0.0,0.0
CENPS,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.04,0.0,0.020833,0.0,0.032258,0.0,0.0
CD320,0.0,0.0,0.0,1.0,0.0,0.0,0.032258,0.055556,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.051282,0.090909,0.0,0.083333,0.083333
ART3,0.058824,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.25,0.0,...,0.0,0.222222,0.0,0.090909,0.25,0.0,0.0,0.125,0.0,0.0


## Save Gene Similarity Matrix 

In [47]:
# The output name carries a YYYY_MM stamp of the run date.
# The data is written gzip-compressed, so the suffix is '.gz' (not '.zip') to
# match the actual format on disk.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/Documents/Harmonizome/Reactome/Output/reactome_gene_similarity_matix_%s.tsv.gz' % date_stamp
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [48]:
# Same helper and 'jaccard' option as for the gene similarity matrix, but fed
# the transposed binary matrix so that its columns become the compared items.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [49]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,RNA polymerase II transcribes snRNA genes,Common Pathway of Fibrin Clot Formation,Extracellular matrix organization,Ion channel transport,Intra-Golgi and retrograde Golgi-to-ER traffic,Heme biosynthesis,Metabolism of polyamines,Early Phase of HIV Life Cycle,Synthesis of IP3 and IP4 in the cytosol,SMAC binds to IAPs,...,PI and PC transport between ER and Golgi membranes,Displacement of DNA glycosylase by APEX1,Negative regulation of FGFR1 signaling,Depyrimidination,Cross-presentation of soluble exogenous antigens (endosomes),Hh mutants abrogate ligand secretion,Oxidative Stress Induced Senescence,Defective GALNT3 causes familial hyperphosphatemic tumoral calcinosis (HFTC),Asparagine N-linked glycosylation,IRS-mediated signalling
RNA polymerase II transcribes snRNA genes,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Common Pathway of Fibrin Clot Formation,0.0,1.0,0.00974,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006667,0.009804
Extracellular matrix organization,0.0,0.00974,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003413,...,0.0,0.0,0.003115,0.0,0.0,0.0,0.0,0.0,0.003527,0.017668
Ion channel transport,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.004348,0.0,...,0.0,0.0,0.017094,0.0,0.0,0.015504,0.013514,0.0,0.008316,0.020747
Intra-Golgi and retrograde Golgi-to-ER traffic,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.15869,0.0


## Save Attribute Similarity Matrix

In [50]:
# The output name carries a YYYY_MM stamp of the run date.
# The data is written gzip-compressed, so the suffix is '.gz' (not '.zip') to
# match the actual format on disk.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/Documents/Harmonizome/Reactome/Output/reactome_attribute_similarity_matix_%s.tsv.gz' % date_stamp
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [51]:
# Project helper that derives the gene table from binary_matrix. The preview
# that follows shows GeneSym and GeneID columns; how the IDs are looked up is
# defined in untility_functions.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  10237 Out of 10237   

In [52]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,NOD1,10392
1,SCN11A,11280
2,CENPS,378708
3,CD320,51293
4,ART3,419


In [53]:
# The row count can be checked against the number of rows in binary_matrix.
gene_list.shape

(10237, 2)

## Save Gene List

In [54]:
# The output name carries a YYYY_MM stamp of the run date.
# The data is written gzip-compressed, so the suffix is '.gz' (not '.zip') to
# match the actual format on disk. The row index is not written.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/Documents/Harmonizome/Reactome/Output/reactome_gene_list_%s.tsv.gz' % date_stamp
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [55]:
# Project helper that derives the attribute table from binary_matrix. The
# preview that follows shows a single 'Attributes' column of pathway names;
# details are defined in untility_functions.
attribute_list = uf.createAttributeList(binary_matrix)

In [56]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0,Attributes
0,RNA polymerase II transcribes snRNA genes
1,Common Pathway of Fibrin Clot Formation
2,Extracellular matrix organization
3,Ion channel transport
4,Intra-Golgi and retrograde Golgi-to-ER traffic


In [57]:
# The row count can be checked against the number of columns in binary_matrix.
attribute_list.shape

(1887, 1)

## Save Attribute List

In [58]:
# The output name carries a YYYY_MM stamp of the run date.
# The data is written gzip-compressed, so the suffix is '.gz' (not '.zip') to
# match the actual format on disk. The row index is not written.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/Documents/Harmonizome/Reactome/Output/reactome_attribute_list_%s.tsv.gz' % date_stamp
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [59]:
# Output directory, resolved from the current user's home directory instead of
# a hardcoded /Users/<name> prefix so the notebook also runs on other machines.
# The trailing slash is kept because the value is passed on as a path prefix.
path = os.path.expanduser('~/Documents/Harmonizome/Reactome/Output/')

In [60]:
# Base name passed to the edge list writer in the next cell.
name = 'reactome_gene_attribute_edge_list'

In [61]:
# Project helper; presumably writes the gene-attribute edge list into `path`
# using `name` as the file stem, and prints the number of associations it
# found -- the output format is defined in untility_functions, verify there.
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  1887 Out of 1887   

 The number of statisticaly relevent gene-attribute associations is: 105556
