# Wikipathways

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Source: http://www.wikipathways.org/index.php/WikiPathways

In [85]:
import datetime
import importlib
import os
import sys

import numpy as np
import pandas as pd

import untility_functions as uf
%matplotlib inline

In [86]:
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/wikipathways/untility_functions.py'>

# Load Data 

In [87]:
# Read the GMT file with '%' as the delimiter. Each line is split into four columns:
# columns 0-2 hold the pathway name, the release tag and the WP identifier, and
# column 3 keeps the remainder of the line as one string that is still tab-separated
# (see the preview below). That string is split further when the gene lists are extracted.
df = pd.read_csv('wikipathways-20170110-gmt-Homo_sapiens.gmt', sep='%', header=None)

In [88]:
df.head()

Unnamed: 0,0,1,2,3
0,Tryptophan metabolism,WikiPathways_20170110,WP465,Homo sapiens\thttp://www.wikipathways.org/inst...
1,Steroid Biosynthesis,WikiPathways_20170110,WP496,Homo sapiens\thttp://www.wikipathways.org/inst...
2,Arylamine metabolism,WikiPathways_20170110,WP694,Homo sapiens\thttp://www.wikipathways.org/inst...
3,Regulation of Actin Cytoskeleton,WikiPathways_20170110,WP51,Homo sapiens\thttp://www.wikipathways.org/inst...
4,Interleukin-11 Signaling Pathway,WikiPathways_20170110,WP2332,Homo sapiens\thttp://www.wikipathways.org/inst...


In [89]:
df.shape

(377, 4)

# Get Gene Pathway Info

In [90]:
# Expand every pathway row into one (Gene, Pathway) record per gene.
#
# Records are collected in a plain list and converted to a DataFrame once at the end;
# concatenating a growing DataFrame inside the loop copies all previous rows on every
# iteration. The resulting frame has a fresh 0..n-1 index.
n_rows = len(df.index)
records = []

for i, index in enumerate(df.index):

    progressPercent = ((i+1)/n_rows)*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), n_rows))
    sys.stdout.flush()

    # Label-based .loc replaces the .ix indexer, which no longer exists in pandas >= 1.0.
    # Column 3 is a tab-separated string; its first three fields are skipped and every
    # remaining field is treated as a gene identifier.
    genes = df.loc[index, 3].split('\t')[3:]
    pathway = df.loc[index, 0]

    records.extend((gene, pathway) for gene in genes)

df_pathways = pd.DataFrame(records, columns=['Gene', 'Pathway'])

Progress: 100%  377 Out of 377   

In [91]:
df_pathways.head()

Unnamed: 0,Gene,Pathway
0,1571,Tryptophan metabolism
1,4129,Tryptophan metabolism
2,216,Tryptophan metabolism
3,316,Tryptophan metabolism
4,217,Tryptophan metabolism


In [92]:
# Replace the index with a clean 0..n-1 range, discarding the old index values.
df_pathways = df_pathways.reset_index(drop=True)

In [93]:
df_pathways.shape

(15595, 2)

# Load Gene Meta

In [94]:
# Load the gene metadata table indexed by GeneID.
# Only the GeneID and Symbol columns are parsed: nothing else from this file is used
# in the notebook, and skipping the other columns keeps the load time and memory
# footprint down.
gene_meta = pd.read_csv('gene_info', sep='\t', usecols=['GeneID', 'Symbol'], index_col='GeneID')

In [95]:
gene_meta = gene_meta[['Symbol']]

In [96]:
gene_meta.head()

Unnamed: 0_level_0,Symbol
GeneID,Unnamed: 1_level_1
5692769,NEWENTRY
1246500,repA1
1246501,repA2
1246502,leuA
1246503,leuB


# Map Gene ID To Gene Symbol

In [97]:
# Replace the numeric gene IDs in the 'Gene' column with the matching gene symbols.
#
# The lookup is a single vectorised Series.map against gene_meta['Symbol'] (indexed by
# GeneID) instead of one .ix lookup per row; .ix no longer exists in pandas >= 1.0.
# Assumes the GeneID index of gene_meta is unique - TODO confirm for the gene_info file in use.
gene_ids = df_pathways['Gene'].astype(int)

# Fail loudly on IDs that are absent from gene_meta, as a per-row label lookup would,
# rather than letting them turn into NaN symbols silently.
unknown_ids = gene_ids[~gene_ids.isin(gene_meta.index)].unique()
if len(unknown_ids) > 0:
    raise KeyError('Gene IDs not found in gene_meta: %s' % list(unknown_ids))

df_pathways['Gene'] = gene_ids.map(gene_meta['Symbol'])

Progress: 100%  15595 Out of 15595   

In [98]:
df_pathways.head()

Unnamed: 0,Gene,Pathway
0,CYP2E1,Tryptophan metabolism
1,MAOB,Tryptophan metabolism
2,ALDH1A1,Tryptophan metabolism
3,AOX1,Tryptophan metabolism
4,ALDH2,Tryptophan metabolism


In [99]:
df_pathways.shape

(15595, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [100]:
df_pathways.set_index('Gene', inplace=True)

In [101]:
# Project helper that maps the gene symbols in the index to approved symbols.
# The frame is modified in place: nothing is assigned here, yet the row count
# reported by the next cell is lower than before the call.
uf.mapgenesymbols(df_pathways)

Progeres: 100%  15595 Out of 15595   

In [102]:
df_pathways.shape

(15540, 1)

# Drop Duplicates

In [106]:
df_pathways.reset_index(inplace=True)

In [107]:
df_pathways.drop_duplicates(subset=['Gene', 'Pathway'], inplace=True)

In [108]:
df_pathways.shape

(15503, 2)

In [109]:
df_pathways.head()

Unnamed: 0,Gene,Pathway
0,CYP2E1,Tryptophan metabolism
1,MAOB,Tryptophan metabolism
2,ALDH1A1,Tryptophan metabolism
3,AOX1,Tryptophan metabolism
4,ALDH2,Tryptophan metabolism


# Create Binary Matrix

In [110]:
binary_matrix = uf.createBinaryMatix(df_pathways)

Progeres: 100%  5388 Out of 5388   

In [111]:
binary_matrix.head()

Unnamed: 0,Lidocaine metabolism,Hfe effect on hepcidin production,Serotonin Receptor 2 and STAT3 Signaling,Selenium Micronutrient Network,LncRNA-mediated mechanisms of therapeutic resistance,Brain-Derived Neurotrophic Factor (BDNF) signaling pathway,TarBasePathway,Cytoplasmic Ribosomal Proteins,Estrogen metabolism,IL1 and megakaryocytes in obesity,...,Retinoblastoma (RB) in Cancer,Tamoxifen metabolism,Ectoderm Differentiation,Type II interferon signaling (IFNG),Neural Crest Differentiation,Complement and Coagulation Cascades,TCA Cycle and Deficiency of Pyruvate Dehydrogenase complex (PDHc),Peptide GPCRs,Felbamate Metabolism,Focal Adhesion
FPR2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
MIR129-1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TMOD1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ULK1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TNFSF9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [112]:
binary_matrix.shape

(5388, 372)

# Save Binary Matrix

In [113]:
# Write the binary matrix as a gzip-compressed TSV stamped with the current year and month.
# The data is gzip-compressed, so the file name ends in '.gz'; a '.zip' suffix on a gzip
# stream cannot be opened by zip tools or by extension-based compression inference.
filename = '~/Documents/Harmonizome/wikipathways/Output/wikipathways_binary_matrix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [117]:
# Output directory for the gene set library, resolved relative to the current user's
# home directory instead of being hard-coded to one account.
path = os.path.expanduser('~/Documents/Harmonizome/wikipathways/Output/')

In [118]:
name = 'wikipathways_gene_set'

In [119]:
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  372 Out of 372   

# Create Attribute Library

In [120]:
# Output directory for the attribute set library, resolved relative to the current
# user's home directory instead of being hard-coded to one account.
path = os.path.expanduser('~/Documents/Harmonizome/wikipathways/Output/')

In [121]:
name = 'wikipathways_attribute_set'

In [122]:
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  5388 Out of 5388   

# Create Gene Similarity Matrix

In [123]:
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [124]:
gene_similarity_matix.head()

Unnamed: 0,FPR2,MIR129-1,TMOD1,ULK1,TNFSF9,FRS3,NPC1L1,E2F4,CEBPA,CTR9,...,RNF20,VWF,PTGES,MIR132,RAMP2,MIR1307,CPLX2,RAI1,SLC2A4RG,SERPING1
FPR2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
MIR129-1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,0.0,1.0,0.0,0.0,0.0,0.0
TMOD1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ULK1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNFSF9,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save Gene Similarity Matrix 

In [125]:
# Write the gene similarity matrix as a gzip-compressed TSV stamped with the current
# year and month. The data is gzip-compressed, so the file name ends in '.gz'; a '.zip'
# suffix on a gzip stream cannot be opened by zip tools or by extension-based
# compression inference.
filename = '~/Documents/Harmonizome/wikipathways/Output/wikipathways_gene_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [126]:
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [127]:
attribute_similarity_matix.head()

Unnamed: 0,Lidocaine metabolism,Hfe effect on hepcidin production,Serotonin Receptor 2 and STAT3 Signaling,Selenium Micronutrient Network,LncRNA-mediated mechanisms of therapeutic resistance,Brain-Derived Neurotrophic Factor (BDNF) signaling pathway,TarBasePathway,Cytoplasmic Ribosomal Proteins,Estrogen metabolism,IL1 and megakaryocytes in obesity,...,Retinoblastoma (RB) in Cancer,Tamoxifen metabolism,Ectoderm Differentiation,Type II interferon signaling (IFNG),Neural Crest Differentiation,Complement and Coagulation Cascades,TCA Cycle and Deficiency of Pyruvate Dehydrogenase complex (PDHc),Peptide GPCRs,Felbamate Metabolism,Focal Adhesion
Lidocaine metabolism,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.055556,0.0,...,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
Hfe effect on hepcidin production,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.009434,0.0,0.0,0.0,0.0,0.0
Serotonin Receptor 2 and STAT3 Signaling,0.0,0.0,1.0,0.0,0.0,0.013889,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.026316,0.0,0.0,0.0,0.0,0.0,0.0
Selenium Micronutrient Network,0.0,0.0,0.0,1.0,0.0,0.008811,0.0,0.0,0.0,0.057692,...,0.0,0.0,0.004405,0.02521,0.010811,0.028571,0.0,0.0,0.0,0.003584
LncRNA-mediated mechanisms of therapeutic resistance,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.020833,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Save Attribute Similarity Matrix

In [128]:
# Write the attribute similarity matrix as a gzip-compressed TSV stamped with the
# current year and month. The data is gzip-compressed, so the file name ends in '.gz';
# a '.zip' suffix on a gzip stream cannot be opened by zip tools or by extension-based
# compression inference.
filename = '~/Documents/Harmonizome/wikipathways/Output/wikipathways_attribute_similarity_matix_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [129]:
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  5388 Out of 5388   

In [130]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,FPR2,2358
1,MIR129-1,406917
2,TMOD1,7111
3,ULK1,8408
4,TNFSF9,8744


In [131]:
gene_list.shape

(5388, 2)

## Save Gene List

In [132]:
# Write the gene list (without the row index) as a gzip-compressed TSV stamped with the
# current year and month. The data is gzip-compressed, so the file name ends in '.gz';
# a '.zip' suffix on a gzip stream cannot be opened by zip tools or by extension-based
# compression inference.
filename = '~/Documents/Harmonizome/wikipathways/Output/wikipathways_gene_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [133]:
attribute_list = uf.createAttributeList(binary_matrix)

In [134]:
attribute_list.head()

Unnamed: 0,Attributes
0,Lidocaine metabolism
1,Hfe effect on hepcidin production
2,Serotonin Receptor 2 and STAT3 Signaling
3,Selenium Micronutrient Network
4,LncRNA-mediated mechanisms of therapeutic resi...


In [135]:
attribute_list.shape

(372, 1)

## Save Attribute List

In [136]:
# Write the attribute list (without the row index) as a gzip-compressed TSV stamped with
# the current year and month. The data is gzip-compressed, so the file name ends in
# '.gz'; a '.zip' suffix on a gzip stream cannot be opened by zip tools or by
# extension-based compression inference.
filename = '~/Documents/Harmonizome/wikipathways/Output/wikipathways_attribute_list_%s.tsv.gz' % datetime.date.today().strftime('%Y_%m')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [137]:
# Output directory for the gene-attribute edge list, resolved relative to the current
# user's home directory instead of being hard-coded to one account.
path = os.path.expanduser('~/Documents/Harmonizome/wikipathways/Output/')

In [138]:
name = 'wikipathways_gene_attribute_edge_list'

In [139]:
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  372 Out of 372   

 The number of statisticaly relevent gene-attribute associations is: 15503
