# Pathway Commons Protein-Protein Interactions

Author: Moshe Silverstein  
Date: 08-18  
Data Source Home: https://www.pathwaycommons.org/    
Data Source Download: http://www.pathwaycommons.org/archives/PC2/v10/

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import utility_functions as uf
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter

In [2]:
import seaborn as sns
# Apply seaborn's default styling with color codes enabled.
sns.set(color_codes=True)
# Seed NumPy's global RNG with the sum of the code points of "distributions".
np.random.seed(sum(ord(ch) for ch in "distributions"))

# Path to Output Files

In [3]:
# Directory that receives every output file written by this notebook.
# Set the HARMONIZOME_OUTPUT_DIR environment variable to redirect the output;
# the original author-specific location remains the default.
# os.path.join(..., '') guarantees the trailing separator that the later
# `path + filename` concatenations rely on.
path = os.path.join(
    os.environ.get(
        'HARMONIZOME_OUTPUT_DIR',
        '/Users/moshesilverstein/Documents/Harmonizome/Pathwaycommons/Output/'
    ),
    ''
)

# Load Data

In [4]:
# Read the tab-separated Pathway Commons interaction table into a DataFrame.
df = pd.read_csv('Input/PathwayCommons10.All.hgnc.txt', delimiter='\t')

In [5]:
# Preview the first rows of the raw table to check that the columns parsed as expected.
df.head()

Unnamed: 0,PARTICIPANT_A,INTERACTION_TYPE,PARTICIPANT_B,INTERACTION_DATA_SOURCE,INTERACTION_PUBMED_ID,PATHWAY_NAMES,MEDIATOR_IDS
0,A1BG,controls-expression-of,A2M,pid,12456685;7678052;9794795,IL6-mediated signaling events,http://pathwaycommons.org/pc2/TemplateReaction...
1,A1BG,interacts-with,ABCC6,BioGRID,21988832,,http://pathwaycommons.org/pc2/MolecularInterac...
2,A1BG,interacts-with,ACE2,BIND,15791205,,http://pathwaycommons.org/pc2/MolecularInterac...
3,A1BG,interacts-with,ADAM10,BIND,15280379,,http://pathwaycommons.org/pc2/MolecularInterac...
4,A1BG,interacts-with,ADAM17,BIND,15280379,,http://pathwaycommons.org/pc2/MolecularInterac...


In [6]:
# (rows, columns) of the raw table.
df.shape

(2407583, 7)

# Get Relevant Data

In [13]:
# Keep only the two interaction partners; all other columns are dropped.
# .copy() makes the result an independent frame, so the in-place operations
# applied to `df` later (set_index, drop_duplicates, and whatever
# uf.mapgenesymbols does) never act on a frame pandas has flagged as a
# slice of the raw table (the source of SettingWithCopyWarning).
df = df[['PARTICIPANT_A', 'PARTICIPANT_B']].copy()

In [14]:
# Confirm that only the two participant columns remain.
df.head()

Unnamed: 0,PARTICIPANT_A,PARTICIPANT_B
0,A1BG,A2M
1,A1BG,ABCC6
2,A1BG,ACE2
3,A1BG,ADAM10
4,A1BG,ADAM17


# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [15]:
# Move PARTICIPANT_A into the index ahead of the symbol-mapping call.
df = df.set_index('PARTICIPANT_A')

In [16]:
# Map the PARTICIPANT_A symbols (currently the index of `df`) to approved gene symbols.
# NOTE(review): the return value is not used, so the helper presumably edits
# `df` in place — confirm in utility_functions.
uf.mapgenesymbols(df)

Progeres: 100%  2407583 Out of 2407583   

In [17]:
# Turn the mapped PARTICIPANT_A index back into an ordinary column.
df = df.reset_index()

In [18]:
# Move PARTICIPANT_B into the index ahead of the second symbol-mapping call.
df = df.set_index('PARTICIPANT_B')

In [19]:
# Map the PARTICIPANT_B symbols (currently the index of `df`) to approved gene symbols.
# NOTE(review): the return value is not used, so the helper presumably edits
# `df` in place — confirm in utility_functions.
uf.mapgenesymbols(df)

Progeres: 100%  1274230 Out of 1274230   

# Drop Duplicates

In [20]:
# Turn the mapped PARTICIPANT_B index back into an ordinary column.
df = df.reset_index()

In [21]:
# Remove repeated (PARTICIPANT_B, PARTICIPANT_A) rows, keeping the first occurrence of each.
df = df.drop_duplicates()

In [22]:
# Preview the de-duplicated pairs. PARTICIPANT_B now comes first because it
# was the index most recently moved back into the columns.
df.head()

Unnamed: 0,PARTICIPANT_B,PARTICIPANT_A
0,A2M,A1BG
1,ABCC6,A1BG
2,ACE2,A1BG
3,ADAM10,A1BG
4,ADAM17,A1BG


In [23]:
# (rows, columns) after symbol mapping and de-duplication.
df.shape

(1125042, 2)

# Create Binary Matrix

In [24]:
# Build the gene-by-attribute matrix from the participant pairs.
# The columns are selected explicitly in (A, B) order because the index
# round-trips above left PARTICIPANT_B as the first column of `df`.
# NOTE(review): each pair is passed in one direction only and the resulting
# matrix is not square, so it is presumably not symmetric — confirm whether
# undirected interactions should be mirrored before the similarity steps.
binary_matrix = uf.createBinaryMatrix(df[['PARTICIPANT_A', 'PARTICIPANT_B']])

Progeres: 100%  16291 Out of 16291   

In [25]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,MEDAG,CTNNAL1,OR5M9,DGCR6L,FAT1,USP30,C2CD6,LATS2,CRYAA,MXD3,...,SLC39A7,CAPG,OTUD6B,TOB2,RPL27A,XKR4,CEP104,CCAR1,NEK11,UBIAD1
MEDAG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CTNNAL1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DGCR6L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAT1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
USP30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# (rows, columns) of the binary matrix.
binary_matrix.shape

(16291, 18511)

# Save Binary Matrix

In [31]:
# Name the file after the current year and month (YYYY_MM).
# The data are written as a gzip stream, so the extension is `.gz`; the
# previous `.zip` suffix mislabeled the file, which breaks zip tools and
# extension-based decompression when the file is read back.
filename = path + 'pathway_commons_ppi_matrix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [32]:
# Base name handed to uf.createUpGeneSetLib in the next cell.
name = 'pathway_commons_ppi_gene_set'

In [33]:
# Build the gene set library from the binary matrix.
# NOTE(review): nothing is returned here, so the helper presumably writes
# its result under `path` using `name` — confirm in utility_functions.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  18511 Out of 18511   

# Create Attribute Library

In [34]:
# Base name handed to uf.createUpAttributeSetLib in the next cell
# (rebinds the `name` used for the gene set library).
name = 'pathway_commons_ppi_attribute_set'

In [35]:
# Build the attribute set library from the binary matrix.
# NOTE(review): nothing is returned here, so the helper presumably writes
# its result under `path` using `name` — confirm in utility_functions.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  16291 Out of 16291   

# Create Gene List

In [36]:
# Derive the gene table (GeneSym, GeneID columns) from the binary matrix.
# NOTE(review): the genes appear to come from the matrix rows — confirm in utility_functions.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  16291 Out of 16291   

In [37]:
# Preview the first rows of the gene table.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,MEDAG,84935
1,CTNNAL1,8727
2,DGCR6L,85359
3,FAT1,2195
4,USP30,84749


In [38]:
# (rows, columns) of the gene table.
gene_list.shape

(16291, 2)

# Save Gene List

In [39]:
# Name the file after the current year and month (YYYY_MM).
# The data are written as a gzip stream, so the extension is `.gz`; the
# previous `.zip` suffix mislabeled the file, which breaks zip tools and
# extension-based decompression when the file is read back.
filename = path + 'pathway_commons_ppi_gene_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
# index=False: the positional row index carries no information.
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [40]:
# Reuse the gene-list helper on the transposed matrix so the attributes
# (the matrix columns) are listed; the result keeps the GeneSym/GeneID column names.
attribute_list = uf.createGeneList(binary_matrix.T)

Progeres: 100%  18511 Out of 18511   

In [41]:
# Preview the first rows of the attribute table.
attribute_list.head()

Unnamed: 0,GeneSym,GeneID
0,MEDAG,84935
1,CTNNAL1,8727
2,OR5M9,390162
3,DGCR6L,85359
4,FAT1,2195


In [42]:
# (rows, columns) of the attribute table.
attribute_list.shape

(18511, 2)

# Save Attribute List

In [43]:
# Name the file after the current year and month (YYYY_MM).
# The data are written as a gzip stream, so the extension is `.gz`; the
# previous `.zip` suffix mislabeled the file, which breaks zip tools and
# extension-based decompression when the file is read back.
filename = path + 'pathway_commons_ppi_attribute_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
# index=False: the positional row index carries no information.
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute Similarity Matrix

In [44]:
# Pairwise 'jaccard' similarity between attributes; the matrix is transposed
# so that attributes are on the rows handed to the helper.
# NOTE(review): the exact metric semantics live in utility_functions — confirm
# whether the helper returns Jaccard similarity or distance.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [45]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,MEDAG,CTNNAL1,OR5M9,DGCR6L,FAT1,USP30,C2CD6,LATS2,CRYAA,MXD3,...,SLC39A7,CAPG,OTUD6B,TOB2,RPL27A,XKR4,CEP104,CCAR1,NEK11,UBIAD1
,,,,,,,,,,,,,,,,,,,,,
MEDAG,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.012987,0.019608,0.0,...,0.0,0.0,0.0,0.0,0.006173,0.166667,0.0,0.074074,0.071429,0.0
CTNNAL1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.008696,0.0,0.0,...,0.0,0.0,0.0,0.0,0.01005,0.0,0.0,0.0,0.0,0.0
OR5M9,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DGCR6L,0.0,0.0,0.0,1.0,0.019608,0.0,0.0,0.0,0.019231,0.029412,...,0.009709,0.083333,0.0,0.066667,0.0,0.0,0.0,0.0,0.0,0.086957
FAT1,0.0,0.0,0.0,0.019608,1.0,0.010989,0.0,0.025862,0.01087,0.0,...,0.028571,0.039216,0.0,0.028571,0.02,0.0,0.0,0.0,0.018182,0.015625


In [46]:
# (rows, columns) of the attribute similarity matrix.
attribute_similarity_matix.shape

(18511, 18511)

# Save Attribute Similarity Matrix

In [47]:
# Name the file after the current year and month (YYYY_MM).
# The data are written as a gzip stream, so the extension is `.gz`; the
# previous `.zip` suffix mislabeled the file, which breaks zip tools and
# extension-based decompression when the file is read back.
# NOTE(review): the 'matix' spelling in the file name is kept unchanged so
# existing consumers of these files still find them.
filename = path + 'pathway_commons_ppi_attribute_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Similarity Matrix

In [48]:
# Pairwise 'jaccard' similarity between genes (the rows of the binary matrix).
# NOTE(review): the exact metric semantics live in utility_functions — confirm
# whether the helper returns Jaccard similarity or distance.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [49]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,MEDAG,CTNNAL1,DGCR6L,FAT1,USP30,C2CD6,CRYAA,LATS2,MOCS1,EPS8,...,SLC39A7,CAPG,OTUD6B,TOB2,RPL27A,PSG2,CEP104,CCAR1,NEK11,UBIAD1
,,,,,,,,,,,,,,,,,,,,,
MEDAG,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
CTNNAL1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.002865,0.0,0.0,0.0,0.004878,0.0,0.007092,0.0,0.0,0.0
DGCR6L,0.0,0.0,1.0,0.0,0.0,0.05,0.0,0.013889,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAT1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.018868,0.0,0.011364,...,0.003322,0.0,0.0,0.0,0.006369,0.0,0.0,0.0,0.0,0.0
USP30,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# (rows, columns) of the gene similarity matrix.
gene_similarity_matix.shape

(16291, 16291)

# Save Gene Similarity Matrix

In [51]:
# Name the file after the current year and month (YYYY_MM).
# The data are written as a gzip stream, so the extension is `.gz`; the
# previous `.zip` suffix mislabeled the file, which breaks zip tools and
# extension-based decompression when the file is read back.
# NOTE(review): the 'matix' spelling in the file name is kept unchanged so
# existing consumers of these files still find them.
filename = path + 'pathway_commons_ppi_gene_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene-Attribute Edge List

In [52]:
# Base name handed to uf.createGeneAttributeEdgeList below
# (rebinds the `name` used for the earlier libraries).
name = 'pathway_commons_ppi_gene_attribute_edge_list'

In [55]:
# Index the attribute table by gene symbol before building the edge list.
# The guard keeps this cell re-runnable: once 'GeneSym' has become the index
# it is no longer a column, and an unconditional set_index would raise KeyError.
if 'GeneSym' in attribute_list.columns:
    attribute_list.set_index('GeneSym', inplace=True)

In [56]:
# Build the gene-attribute edge list from the binary matrix and the two lookup tables.
# `attribute_list` is indexed by GeneSym at this point, while `gene_list` still
# has GeneSym as an ordinary column.
# NOTE(review): nothing is returned here, so the helper presumably writes
# its result under `path` using `name` — confirm in utility_functions.
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, path, name)

Progeres: 100%  18511 Out of 18511   

 The number of statisticaly relevent gene-attribute associations is: 1125042
