# Pathway Commons Pathways

Author: Moshe Silverstein  
Date: 08-18  
Data Source Home: https://www.pathwaycommons.org/    
Data Source Download: http://www.pathwaycommons.org/archives/PC2/v10/

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import utility_functions as uf
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [2]:
import seaborn as sns
# Apply seaborn's default styling with color_codes switched on.
sns.set(color_codes=True)
# Seed NumPy's global RNG with a fixed integer: the sum of the code points of "distributions".
np.random.seed(sum(ord(character) for character in "distributions"))

# Path to Output Files

In [3]:
# Directory that receives every output file written by this notebook.
# Set the HARMONIZOME_OUTPUT_DIR environment variable to redirect the outputs;
# when it is unset the original author-specific location is used, so existing
# runs behave exactly as before.
# os.path.join(..., '') guarantees a trailing separator, which the later
# `path + '<file name>'` concatenations rely on.
path = os.path.join(
    os.environ.get(
        'HARMONIZOME_OUTPUT_DIR',
        '/Users/moshesilverstein/Documents/Harmonizome/Pathwaycommons/Output/'
    ),
    ''
)

# Load Data

In [4]:
# Read the tab-separated Pathway Commons interaction table; the path is relative to the
# notebook's working directory.
df = pd.read_csv('Input/PathwayCommons10.All.hgnc.txt', sep='\t')

In [5]:
df.head()

Unnamed: 0,PARTICIPANT_A,INTERACTION_TYPE,PARTICIPANT_B,INTERACTION_DATA_SOURCE,INTERACTION_PUBMED_ID,PATHWAY_NAMES,MEDIATOR_IDS
0,A1BG,controls-expression-of,A2M,pid,12456685;7678052;9794795,IL6-mediated signaling events,http://pathwaycommons.org/pc2/TemplateReaction...
1,A1BG,interacts-with,ABCC6,BioGRID,21988832,,http://pathwaycommons.org/pc2/MolecularInterac...
2,A1BG,interacts-with,ACE2,BIND,15791205,,http://pathwaycommons.org/pc2/MolecularInterac...
3,A1BG,interacts-with,ADAM10,BIND,15280379,,http://pathwaycommons.org/pc2/MolecularInterac...
4,A1BG,interacts-with,ADAM17,BIND,15280379,,http://pathwaycommons.org/pc2/MolecularInterac...


In [6]:
df.shape

(2407583, 7)

# Get relevant data

In [7]:
# Keep only the first participant and the pathway annotation of every interaction.
df1 = df.loc[:, ['PARTICIPANT_A', 'PATHWAY_NAMES']]

In [8]:
# Give the participant-A frame the common (Gene, Pathway) column names.
df1 = df1.rename(columns={'PARTICIPANT_A': 'Gene', 'PATHWAY_NAMES': 'Pathway'})

In [9]:
df1.shape

(2407583, 2)

In [10]:
# Keep only the second participant and the pathway annotation of every interaction.
df2 = df.loc[:, ['PARTICIPANT_B', 'PATHWAY_NAMES']]

In [11]:
# Give the participant-B frame the common (Gene, Pathway) column names.
df2 = df2.rename(columns={'PARTICIPANT_B': 'Gene', 'PATHWAY_NAMES': 'Pathway'})

In [12]:
df2.shape

(2407583, 2)

In [13]:
# Stack the participant-A and participant-B frames vertically into one (Gene, Pathway) table.
df = pd.concat(objs=[df1, df2], axis=0)

In [14]:
# Replace the index with a fresh 0..n-1 range; the old labels move into an 'index' column.
df = df.reset_index()

In [15]:
# Discard the 'index' column, leaving only Gene and Pathway.
df = df.drop(labels=['index'], axis=1)

In [16]:
df.head()

Unnamed: 0,Gene,Pathway
0,A1BG,IL6-mediated signaling events
1,A1BG,
2,A1BG,
3,A1BG,
4,A1BG,


In [17]:
df.shape

(4815166, 2)

# Drop NAs

In [18]:
# Remove every row in which either the gene or the pathway annotation is missing.
df = df.dropna(axis=0, how='any')

In [19]:
df.head()

Unnamed: 0,Gene,Pathway
0,A1BG,IL6-mediated signaling events
7,A1BG,IL6-mediated signaling events;Integrin-linked ...
8,A1BG,IL6-mediated signaling events;Integrin-linked ...
26,A1BG,IL6-mediated signaling events
41,A1BG,IL6-mediated signaling events


In [20]:
df.shape

(951058, 2)

# Drop Duplicates

In [21]:
# Collapse repeated (Gene, Pathway-string) rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [22]:
df.head()

Unnamed: 0,Gene,Pathway
0,A1BG,IL6-mediated signaling events
7,A1BG,IL6-mediated signaling events;Integrin-linked ...
70,A1CF,Formation of the Editosome;mRNA Editing: C to ...
132,A2M,HDL assembly
145,A2M,IL-6 signaling pathway


In [23]:
df.shape

(148973, 2)

# Split Pathway Data

In [24]:
# Expand every (gene, "pathway1;pathway2;...") row into one row per (gene, pathway) pair.
# The two output lists are filled in lock-step and turned into a DataFrame once at the end.
genes = []
pathways = []

# Hoisted out of the loop: the row count does not change while iterating.
total_rows = len(df.index)
# Report progress roughly once per percent instead of once per row; per-row writes
# flood the notebook's output channel and slow the loop down considerably.
report_every = max(1, total_rows // 100)

# Iterate the two columns directly rather than doing two label-based scalar
# lookups (df.loc[index, ...]) per row.
for i, (gene, pathway_field) in enumerate(zip(df['Gene'], df['Pathway'])):

    if (i+1) % report_every == 0 or (i+1) == total_rows:
        progressPercent = ((i+1)/total_rows)*100

        sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), total_rows))
        sys.stdout.flush()

    # A single field may list several pathways separated by ';'.
    pathway = pathway_field.split(';')

    pathways.extend(pathway)
    # Repeat the gene once for each pathway it was listed with.
    genes.extend([gene]*len(pathway))

# Rebuild df with one (Gene, Pathway) pair per row and a fresh 0..n-1 index.
df = pd.DataFrame({'Gene': genes, 'Pathway': pathways}, columns=['Gene', 'Pathway'])

Progress: 100%  148973 Out of 148973   

In [25]:
df.head()

Unnamed: 0,Gene,Pathway
0,A1BG,IL6-mediated signaling events
1,A1BG,IL6-mediated signaling events
2,A1BG,Integrin-linked kinase signaling
3,A1CF,Formation of the Editosome
4,A1CF,mRNA Editing: C to U Conversion


In [26]:
df.shape

(27123782, 2)

# Drop Duplicates

In [27]:
# After splitting, the same (Gene, Pathway) pair can occur many times; keep one copy of each.
df = df.drop_duplicates()

In [28]:
df.head()

Unnamed: 0,Gene,Pathway
0,A1BG,IL6-mediated signaling events
2,A1BG,Integrin-linked kinase signaling
3,A1CF,Formation of the Editosome
4,A1CF,mRNA Editing: C to U Conversion
5,A2M,HDL assembly


In [29]:
df.shape

(17412643, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [30]:
# Move the Gene column into the index ahead of the gene-symbol mapping step.
df = df.set_index('Gene')

In [31]:
# Run the project helper that maps gene symbols on df, which is indexed by 'Gene' at this point.
# NOTE(review): the return value is discarded, so the helper presumably modifies df in place —
# confirm in utility_functions.
# NOTE(review): in the recorded run the helper's progress output exceeded Jupyter's IOPub
# message rate limit (see the warning captured below this cell).
uf.mapgenesymbols(df)

Progeres: 99%  17412449 Out of 17412643   

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



# Drop Duplicates

In [32]:
# Turn the Gene index back into an ordinary column with a fresh 0..n-1 index.
df = df.reset_index()

In [35]:
# Remove (Gene, Pathway) pairs that became identical after the symbol mapping step.
df = df.drop_duplicates()

In [36]:
df.head()

Unnamed: 0,Gene,Pathway
0,A1BG,IL6-mediated signaling events
1,A1BG,Integrin-linked kinase signaling
2,A1CF,Formation of the Editosome
3,A1CF,mRNA Editing: C to U Conversion
4,A2M,HDL assembly


In [37]:
df.shape

(1257932, 2)

# Create Binary Matrix

In [38]:
# Build the binary matrix from the (Gene, Pathway) pairs with the project helper.
# The recorded output shows genes as row labels and pathway names as column labels.
binary_matrix = uf.createBinaryMatrix(df)

Progeres: 100%  11164 Out of 11164   

In [39]:
binary_matrix.head()

Unnamed: 0,De Novo Triacylglycerol Biosynthesis TG(a-21:0/i-24:0/i-17:0),"Cardiolipin Biosynthesis CL(i-13:0/18:2(9Z,11Z)/a-17:0/i-15:0)",De Novo Triacylglycerol Biosynthesis TG(i-20:0/22:0/10:0),"De Novo Triacylglycerol Biosynthesis TG(22:1(13Z)/16:1(9Z)/22:2(13Z,16Z))",Cardiolipin Biosynthesis CL(a-13:0/i-12:0/i-14:0/i-16:0),"De Novo Triacylglycerol Biosynthesis TG(20:3(5Z,8Z,11Z)/24:1(15Z)/20:5(5Z,8Z,11Z,14Z,17Z))",Cardiolipin Biosynthesis CL(a-13:0/a-21:0/a-25:0/a-15:0),Phosphatidylethanolamine Biosynthesis PE(16:1(9Z)/22:1(13Z)),"Cardiolipin Biosynthesis CL(22:5(4Z,7Z,10Z,13Z,16Z)/18:0/22:6(4Z,7Z,10Z,13Z,16Z,19Z)/18:0)",Cardiolipin Biosynthesis CL(i-12:0/i-19:0/i-18:0/a-21:0),...,"De Novo Triacylglycerol Biosynthesis TG(15:0/15:0/22:5(7Z,10Z,13Z,16Z,19Z))","Cardiolipin Biosynthesis CL(i-13:0/i-19:0/18:2(9Z,11Z)/i-14:0)",Cardiolipin Biosynthesis CL(i-13:0/a-25:0/a-21:0/i-14:0),Cardiolipin Biosynthesis CL(i-12:0/a-25:0/a-15:0/i-13:0),Cardiolipin Biosynthesis CL(i-13:0/i-24:0/i-21:0/i-17:0),De Novo Triacylglycerol Biosynthesis TG(a-25:0/i-21:0/i-18:0),"De Novo Triacylglycerol Biosynthesis TG(16:1(9Z)/18:3(6Z,9Z,12Z)/18:2(9Z,12Z))",Cardiolipin Biosynthesis CL(i-12:0/i-17:0/a-15:0/i-17:0),"De Novo Triacylglycerol Biosynthesis TG(20:0/24:0/18:3(9Z,12Z,15Z))","De Novo Triacylglycerol Biosynthesis TG(14:1(9Z)/18:3(6Z,9Z,12Z)/14:1(9Z))"
PIGB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CEACAM5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZNF551,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GRIA4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RAB8B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
binary_matrix.shape

(11164, 51319)

# Save Binary Matrix

In [41]:
# Save the binary matrix as a gzip-compressed, tab-separated file whose name carries the
# current year and month (YYYY_MM).
# The suffix is ".tsv.gz" so that the extension matches the gzip payload; the former
# ".tsv.zip" name labelled gzip data as a zip archive, which breaks zip tools and
# extension-based compression inference when the file is read back.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = path + 'pathway_commons_pathways_matrix_%s.tsv.gz' % date_tag
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [42]:
# Name string handed to the gene set library helper in the next cell.
name = 'pathway_commons_pathways_gene_set'

In [43]:
# Create the gene set library from the binary matrix with the project helper, passing the
# output directory and the name string.
# NOTE(review): presumably writes a file under `path` named after `name` — confirm in utility_functions.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  51319 Out of 51319   

# Create Attribute Library

In [44]:
# Name string handed to the attribute set library helper in the next cell.
name = 'pathway_commons_pathways_attribute_set'

In [45]:
# Create the attribute set library from the binary matrix with the project helper, passing the
# output directory and the name string.
# NOTE(review): presumably writes a file under `path` named after `name` — confirm in utility_functions.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  11164 Out of 11164   

# Create Gene List

In [46]:
# Derive the gene list from the binary matrix with the project helper.
# The recorded output shows one row per gene with GeneSym and GeneID columns.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  11164 Out of 11164   

In [47]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,PIGB,9488
1,CEACAM5,1048
2,ZNF551,90233
3,GRIA4,2893
4,RAB8B,51762


In [48]:
gene_list.shape

(11164, 2)

# Save Gene List

In [49]:
# Save the gene list as a gzip-compressed, tab-separated file (without the row index) whose
# name carries the current year and month (YYYY_MM).
# The suffix is ".tsv.gz" so that the extension matches the gzip payload; the former
# ".tsv.zip" name labelled gzip data as a zip archive, which breaks zip tools and
# extension-based compression inference when the file is read back.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = path + 'pathway_commons_pathways_gene_list_%s.tsv.gz' % date_tag
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [54]:
# Derive the attribute (pathway) list from the binary matrix with the project helper.
# The recorded output shows a frame labelled by pathway name with no data columns (shape (51319, 0)).
attribute_list = uf.createAttributeList(binary_matrix)

In [55]:
attribute_list.head()

De Novo Triacylglycerol Biosynthesis TG(a-21:0/i-24:0/i-17:0)
"Cardiolipin Biosynthesis CL(i-13:0/18:2(9Z,11Z)/a-17:0/i-15:0)"
De Novo Triacylglycerol Biosynthesis TG(i-20:0/22:0/10:0)
"De Novo Triacylglycerol Biosynthesis TG(22:1(13Z)/16:1(9Z)/22:2(13Z,16Z))"
Cardiolipin Biosynthesis CL(a-13:0/i-12:0/i-14:0/i-16:0)


In [56]:
attribute_list.shape

(51319, 0)

# Save Attribute List

In [57]:
# Save the attribute list as a gzip-compressed, tab-separated file whose name carries the
# current year and month (YYYY_MM).
# The suffix is ".tsv.gz" so that the extension matches the gzip payload; the former
# ".tsv.zip" name labelled gzip data as a zip archive, which breaks zip tools and
# extension-based compression inference when the file is read back.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = path + 'pathway_commons_pathways_attribute_list_%s.tsv.gz' % date_tag
attribute_list.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Similarity Matrix

In [58]:
# Compute the gene similarity matrix from the binary matrix with the project helper, selecting
# its 'jaccard' option. The recorded output is a square gene-by-gene table with 1.0 on the diagonal.
# NOTE(review): the variable name is spelled "matix"; later cells refer to it by this spelling.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [59]:
gene_similarity_matix.head()

Unnamed: 0,PIGB,CEACAM5,ZNF551,GRIA4,RAB8B,TRAPPC9,SKAP1,CEP83,NRBF2,ATP7A,...,SNX1,ERCC6L,MTMR10,RIN2,TAGLN,GPCPD1,ARHGEF12,LHX2,ENAH,MFSD4B
,,,,,,,,,,,,,,,,,,,,,
PIGB,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CEACAM5,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZNF551,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
GRIA4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.015873,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2
RAB8B,0.0,0.0,0.0,0.0,1.0,0.2,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
gene_similarity_matix.shape

(11164, 11164)

# Save Gene Similarity Matrix

In [61]:
# Save the gene similarity matrix as a gzip-compressed, tab-separated file whose name carries
# the current year and month (YYYY_MM).
# The suffix is ".tsv.gz" so that the extension matches the gzip payload; the former
# ".tsv.zip" name labelled gzip data as a zip archive, which breaks zip tools and
# extension-based compression inference when the file is read back.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = path + 'pathway_commons_pathways_gene_similarity_matix_%s.tsv.gz' % date_tag
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [64]:
# Compute the attribute (pathway) similarity matrix by passing the transposed binary matrix to
# the same project helper with its 'jaccard' option. The recorded output is a square
# pathway-by-pathway table.
# NOTE(review): the variable name is spelled "matix"; later cells refer to it by this spelling.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [65]:
attribute_similarity_matix.head()

Unnamed: 0,De Novo Triacylglycerol Biosynthesis TG(a-21:0/i-24:0/i-17:0),"Cardiolipin Biosynthesis CL(i-13:0/18:2(9Z,11Z)/a-17:0/i-15:0)",De Novo Triacylglycerol Biosynthesis TG(i-20:0/22:0/10:0),"De Novo Triacylglycerol Biosynthesis TG(22:1(13Z)/16:1(9Z)/22:2(13Z,16Z))",Cardiolipin Biosynthesis CL(a-13:0/i-12:0/i-14:0/i-16:0),"De Novo Triacylglycerol Biosynthesis TG(20:3(5Z,8Z,11Z)/24:1(15Z)/20:5(5Z,8Z,11Z,14Z,17Z))",Cardiolipin Biosynthesis CL(a-13:0/a-21:0/a-25:0/a-15:0),Phosphatidylethanolamine Biosynthesis PE(16:1(9Z)/22:1(13Z)),"Cardiolipin Biosynthesis CL(22:5(4Z,7Z,10Z,13Z,16Z)/18:0/22:6(4Z,7Z,10Z,13Z,16Z,19Z)/18:0)",Cardiolipin Biosynthesis CL(i-12:0/i-19:0/i-18:0/a-21:0),...,"De Novo Triacylglycerol Biosynthesis TG(15:0/15:0/22:5(7Z,10Z,13Z,16Z,19Z))","Cardiolipin Biosynthesis CL(i-13:0/i-19:0/18:2(9Z,11Z)/i-14:0)",Cardiolipin Biosynthesis CL(i-13:0/a-25:0/a-21:0/i-14:0),Cardiolipin Biosynthesis CL(i-12:0/a-25:0/a-15:0/i-13:0),Cardiolipin Biosynthesis CL(i-13:0/i-24:0/i-21:0/i-17:0),De Novo Triacylglycerol Biosynthesis TG(a-25:0/i-21:0/i-18:0),"De Novo Triacylglycerol Biosynthesis TG(16:1(9Z)/18:3(6Z,9Z,12Z)/18:2(9Z,12Z))",Cardiolipin Biosynthesis CL(i-12:0/i-17:0/a-15:0/i-17:0),"De Novo Triacylglycerol Biosynthesis TG(20:0/24:0/18:3(9Z,12Z,15Z))","De Novo Triacylglycerol Biosynthesis TG(14:1(9Z)/18:3(6Z,9Z,12Z)/14:1(9Z))"
,,,,,,,,,,,,,,,,,,,,,
De Novo Triacylglycerol Biosynthesis TG(a-21:0/i-24:0/i-17:0),1.0,0.833333,1.0,1.0,0.833333,1.0,0.833333,0.081081,0.833333,0.833333,...,1.0,0.833333,0.833333,0.833333,0.833333,1.0,1.0,0.833333,1.0,1.0
"Cardiolipin Biosynthesis CL(i-13:0/18:2(9Z,11Z)/a-17:0/i-15:0)",0.833333,1.0,0.833333,0.833333,1.0,0.833333,1.0,0.052632,1.0,1.0,...,0.833333,1.0,1.0,1.0,1.0,0.833333,0.833333,1.0,0.833333,0.833333
De Novo Triacylglycerol Biosynthesis TG(i-20:0/22:0/10:0),1.0,0.833333,1.0,1.0,0.833333,1.0,0.833333,0.081081,0.833333,0.833333,...,1.0,0.833333,0.833333,0.833333,0.833333,1.0,1.0,0.833333,1.0,1.0
"De Novo Triacylglycerol Biosynthesis TG(22:1(13Z)/16:1(9Z)/22:2(13Z,16Z))",1.0,0.833333,1.0,1.0,0.833333,1.0,0.833333,0.081081,0.833333,0.833333,...,1.0,0.833333,0.833333,0.833333,0.833333,1.0,1.0,0.833333,1.0,1.0
Cardiolipin Biosynthesis CL(a-13:0/i-12:0/i-14:0/i-16:0),0.833333,1.0,0.833333,0.833333,1.0,0.833333,1.0,0.052632,1.0,1.0,...,0.833333,1.0,1.0,1.0,1.0,0.833333,0.833333,1.0,0.833333,0.833333


In [66]:
attribute_similarity_matix.shape

(51319, 51319)

# Save Attribute Similarity Matrix

In [67]:
# Save the attribute similarity matrix as a gzip-compressed, tab-separated file whose name
# carries the current year and month (YYYY_MM).
# The suffix is ".tsv.gz" so that the extension matches the gzip payload; the former
# ".tsv.zip" name labelled gzip data as a zip archive, which breaks zip tools and
# extension-based compression inference when the file is read back.
date_tag = datetime.date.today().strftime('%Y_%m')
filename = path + 'pathway_commons_pathways_attribute_similarity_matix_%s.tsv.gz' % date_tag
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene-Attribute Edge List

In [62]:
# Name string handed to the gene-attribute edge list helper in the next cell.
name = 'pathway_commons_pathways_gene_attribute_edge_list'

In [63]:
# Create the gene-attribute edge list with the project helper from the binary matrix, the
# attribute list and the gene list, passing the output directory and the name string.
# NOTE(review): presumably writes a file under `path` named after `name` — confirm in utility_functions.
# The recorded run reports 1257932 gene-attribute associations.
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, path, name)

Progeres: 100%  51319 Out of 51319   

 The number of statisticaly relevent gene-attribute associations is: 1257932
