# BioPlex

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Downloaded: 5-30-2017 <br/>
Data Source: http://bioplex.hms.harvard.edu/downloadInteractions.php <br/>  http://bioplex.hms.harvard.edu/data/BioPlex_interactionList_v4a.tsv 

In [1]:
import datetime
import importlib
import os
import sys

import numpy as np
import pandas as pd

import my_functions as mf
%matplotlib inline

In [45]:
# Development aid: re-import my_functions so edits made to the module on disk
# take effect without restarting the kernel.
importlib.reload(mf)

<module 'my_functions' from '/Users/moshesilverstein/Documents/Harmonizome/BioPlex/my_functions.py'>

# Load Data

In [3]:
# Read the tab-separated BioPlex interaction list into a DataFrame.
interactions_tsv = '~/./Documents/Harmonizome/BioPlex/input/BioPlex_interactionList_v4a.tsv'
df = pd.read_csv(interactions_tsv, sep='\t')

In [4]:
# Preview the first rows of the raw interaction table.
df.head()

Unnamed: 0,GeneA,GeneB,UniprotA,UniprotB,SymbolA,SymbolB,p(Wrong),p(No Interaction),p(Interaction)
0,100,728378,P00813,A5A3E0,ADA,POTEF,2.380858e-09,0.000332,0.999668
1,100,345651,P00813,Q562R1,ADA,ACTBL2,9.786437e-18,0.211914,0.788086
2,222389,708,Q8N7W2,Q07021,BEND7,C1QBP,2.962215e-17,0.005645,0.994355
3,222389,4038,Q8N7W2,O75096,BEND7,LRP4,3.302994e-10,0.00028,0.99972
4,645121,3312,Q6ZMN8,P11142,CCNI2,HSPA8,2.060285e-16,0.036235,0.963765


In [5]:
# (rows, columns) of the raw interaction table.
df.shape

(56553, 9)

In [6]:
# Keep only the two gene-symbol columns; .copy() detaches the result from df
# so later modifications do not touch the raw table.
ppi_columns = ['SymbolA', 'SymbolB']
df_ppi = df.loc[:, ppi_columns].copy()

# Get Only Protein-Protein Interactions

In [7]:
# Preview the symbol pairs.
df_ppi.head()

Unnamed: 0,SymbolA,SymbolB
0,ADA,POTEF
1,ADA,ACTBL2
2,BEND7,C1QBP
3,BEND7,LRP4
4,CCNI2,HSPA8


In [8]:
# (rows, columns) of the pair table before symbol mapping.
df_ppi.shape

(56553, 2)

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [9]:
# Put SymbolA in the index before calling the symbol mapper.
df_ppi = df_ppi.set_index('SymbolA')
# NOTE(review): the return value is not captured, so mapgenesymbols presumably
# rewrites df_ppi (and its index) in place — confirm in my_functions.
mf.mapgenesymbols(df_ppi)

Progeres: 100%  56553 Out of 56553   

In [10]:
# Move the index back into a regular column.
df_ppi = df_ppi.reset_index()

In [11]:
# Put SymbolB in the index before calling the symbol mapper.
df_ppi = df_ppi.set_index('SymbolB')
# NOTE(review): the return value is not captured, so mapgenesymbols presumably
# rewrites df_ppi (and its index) in place — confirm in my_functions.
mf.mapgenesymbols(df_ppi)

Progeres: 100%  56348 Out of 56348   

In [12]:
# Move the index back into a regular column.
df_ppi = df_ppi.reset_index()

In [13]:
# (rows, columns) of the pair table after both symbol-mapping passes.
df_ppi.shape

(55861, 2)

# Create Binary Matrix

In [37]:
# Build the binary matrix from the mapped symbol pairs.
# NOTE(review): presumably a gene-by-gene 0/1 interaction matrix — the
# construction lives in my_functions; confirm there.
binary_matrix = mf.createBinaryMatix(df_ppi)

Progeres: 100%  10824 Out of 10824   

In [40]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,SLC25A6,IPO9,ICA1,FAM193B,REPS1,GSTO2,CRMP1,STAM,GUCY1B3,CSTF2T,...,TNNT3,TAF10,TMEM252,TEX29,SCAF11,HLA-DMB,MTR,GLUD1,PARD6G,MED6
SLC25A6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
IPO9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ICA1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FAM193B,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
REPS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Save Binary Matrix

In [41]:
# Stamp the output file with the current year and month (YYYY_MM).
date_stamp = datetime.date.today().strftime('%Y_%m')
# The file is written as a gzip stream, so it carries a .gz suffix; a .zip
# suffix would mislabel the content and zip tools could not open it.
# NOTE(review): update any downstream reader that expects the .tsv.zip name.
filename = '~/./Documents/Harmonizome/BioPlex/Output/bioplex_binary_matrix_%s.tsv.gz' % date_stamp
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [42]:
# Output directory for the gene set library, resolved against the current
# user's home directory rather than a machine-specific absolute path.
# The path is expanded here because it is handed to my_functions as a plain
# string — NOTE(review): confirm whether the helper expands '~' itself.
path = os.path.expanduser('~/Documents/Harmonizome/BioPlex/Output/')

In [43]:
# Name argument passed to mf.createUpGeneSetLib.
name = 'bioplex_gene_set'

In [46]:
# Build the gene set library from the binary matrix.
# NOTE(review): nothing is returned to the notebook, so the helper presumably
# writes its result under `path` using `name` — confirm in my_functions.
mf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  10824 Out of 10824   

# Create Gene Similarity Matrix

In [48]:
# Compute the similarity matrix from the binary matrix with the 'jaccard' metric.
# NOTE(review): presumably pairwise similarity between the matrix rows (genes) — confirm in my_functions.
gene_similarity_matix = mf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [49]:
# Preview the first rows of the similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,SLC25A6,IPO9,ICA1,FAM193B,REPS1,GSTO2,CRMP1,STAM,GUCY1B3,CSTF2T,...,TNNT3,TAF10,TMEM252,TEX29,SCAF11,HLA-DMB,MTR,GLUD1,PARD6G,MED6
SLC25A6,1.0,0.027027,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
IPO9,0.027027,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ICA1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
FAM193B,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
REPS1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.03125,...,0.0,0.0,0.0,0.032258,0.0,0.0,0.0,0.0,0.0,0.0


# Save Gene Similarity Matrix 

In [50]:
# Stamp the output file with the current year and month (YYYY_MM).
date_stamp = datetime.date.today().strftime('%Y_%m')
# The file is written as a gzip stream, so it carries a .gz suffix; a .zip
# suffix would mislabel the content and zip tools could not open it.
# NOTE(review): update any downstream reader that expects the .tsv.zip name.
filename = '~/./Documents/Harmonizome/BioPlex/Output/bioplex_gene_similarity_matix_%s.tsv.gz' % date_stamp
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [52]:
# Build the gene list from the binary matrix.
# NOTE(review): presumably one row per gene with its symbol and ID — the
# lookup lives in my_functions; confirm there.
gene_list = mf.createGeneList(binary_matrix)

Progeres: 100%  10824 Out of 10824   

In [53]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,SLC25A6,293
1,IPO9,55705
2,ICA1,3382
3,FAM193B,54540
4,REPS1,85021


In [54]:
# (rows, columns) of the gene list.
gene_list.shape

(10824, 2)

### Save Gene List

In [55]:
# Stamp the output file with the current year and month (YYYY_MM).
date_stamp = datetime.date.today().strftime('%Y_%m')
# The file is written as a gzip stream, so it carries a .gz suffix; a .zip
# suffix would mislabel the content and zip tools could not open it.
# NOTE(review): update any downstream reader that expects the .tsv.zip name.
filename = '~/./Documents/Harmonizome/BioPlex/Output/bioplex_gene_list_%s.tsv.gz' % date_stamp
# index=False: the row index is not written to the file.
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [56]:
# Build the gene-attribute edge list from the binary matrix and the gene list.
# NOTE(review): presumably one row per (attribute, gene) pair including
# zero-weight pairs — confirm in my_functions.
gene_attribute_edge_list = mf.createGeneAttributeEdgeList(binary_matrix, gene_list)

Progeres: 100%  10824 Out of 10824   

In [57]:
# Preview the first rows of the edge list.
gene_attribute_edge_list.head()

Unnamed: 0,Attribute,Gene,GeneID,Weight
0,SLC25A6,SLC25A6,293,0.0
1,SLC25A6,IPO9,55705,0.0
2,SLC25A6,ICA1,3382,0.0
3,SLC25A6,FAM193B,54540,0.0
4,SLC25A6,REPS1,85021,0.0


In [58]:
# (rows, columns) of the full edge list.
gene_attribute_edge_list.shape

(117158976, 4)

### Get Number of (Statistically Relevant) Gene-Attribute Associations

In [59]:
# Shape of the edge list restricted to rows whose Weight is non-zero.
gene_attribute_edge_list.loc[gene_attribute_edge_list['Weight'].ne(0)].shape

(111694, 4)

### Save Gene-Attribute Edge List

In [60]:
# Stamp the output file with the current year and month (YYYY_MM).
date_stamp = datetime.date.today().strftime('%Y_%m')
# The file is written as a gzip stream, so it carries a .gz suffix; a .zip
# suffix would mislabel the content and zip tools could not open it.
# NOTE(review): update any downstream reader that expects the .tsv.zip name.
filename = '~/./Documents/Harmonizome/BioPlex/Output/bioplex_gene_attribute_edge_list_%s.tsv.gz' % date_stamp
# Writes every row of the edge list (zero-weight rows included); index=False
# keeps the row index out of the file.
gene_attribute_edge_list.to_csv(filename, sep='\t', index=False, compression='gzip')