# hu.MAP

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Downloaded: 5-30-2017 <br/>
Data Source: 

In [1]:
import datetime
import importlib
import os
import sys

import numpy as np
import pandas as pd

import untility_functions as uf

%matplotlib inline

In [2]:
# Re-import the helper module so that edits made to untility_functions.py
# are picked up without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/hu.MAP/untility_functions.py'>

# Load Data

In [3]:
# Load the raw interaction table (no header row, so the columns are 0, 1, 2).
# Columns 0 and 1 hold identifiers, not quantities: read them as text so the
# result does not depend on dtype inference. The mapping step further down
# tests each identifier with `'E' in value`, which only works on strings.
df = pd.read_csv(
    'Protein Interaction Network with probability scores.txt',
    sep='\t',
    header=None,
    dtype={0: str, 1: str},
)

In [4]:
# Preview the first rows of the raw table: two identifier columns (0, 1)
# followed by a numeric score column (2).
df.head()

Unnamed: 0,0,1,2
0,1627,71,0.893189
1,1965,10969,0.088725
2,3192,3326,0.198868
3,64599,7699,0.171414
4,28755,102157402,0.0


In [5]:
# (rows, columns) of the raw interaction table.
df.shape

(64048, 3)

# Get Only Protein-Protein Interactions

### Load Mapping File

In [6]:
# Read the tab-separated lookup table that links gene symbols to gene IDs.
symbol_table_path = 'SymbolToId.tsv'
mapping_df = pd.read_csv(symbol_table_path, sep='\t')

In [7]:
# Preview the lookup table (symbol, Entrez ID and Ensembl ID columns).
mapping_df.head()

Unnamed: 0,Approved Symbol,Entrez Gene ID,Ensembl Gene ID
0,A1BG,1.0,ENSG00000121410
1,A1BG-AS1,503538.0,ENSG00000268895
2,A1CF,29974.0,ENSG00000148584
3,A1S9T~withdrawn,,
4,A2M,2.0,ENSG00000175899


In [8]:
# Key the lookup table by Entrez Gene ID so symbols can be fetched by ID.
# Note: this cell is not re-runnable, since the column is consumed by the index.
mapping_df = mapping_df.set_index('Entrez Gene ID')

### Map Entrez Gene IDs to Gene Symbols

In [9]:
def map_entrez_to_symbol(entrez_ids, symbol_by_entrez):
    """Translate a column of Entrez Gene IDs into approved gene symbols.

    Parameters
    ----------
    entrez_ids : pandas.Series
        Raw identifiers as read from the interaction file.
    symbol_by_entrez : pandas.Series
        Approved symbols indexed by Entrez Gene ID. The index must be unique
        and free of NaN.

    Returns
    -------
    pandas.Series
        One entry per input row: the approved symbol, or NaN when the
        identifier contains an 'E', is not numeric, or has no entry in
        ``symbol_by_entrez``.
    """
    as_text = entrez_ids.astype(str)
    # Identifiers containing 'E' are deliberately left unmapped. This also
    # keeps strings such as '1E5' from being parsed as a number.
    contains_e = as_text.str.contains('E', regex=False)
    numeric_ids = pd.to_numeric(as_text.where(~contains_e), errors='coerce')
    # Cast to float so the values compare against the float-valued ID index.
    return numeric_ids.astype(float).map(symbol_by_entrez)


# mapping_df is indexed by 'Entrez Gene ID' at this point. Rows without an
# Entrez ID (NaN index) can never match, and Series.map requires a unique
# index, so drop both. For a duplicated ID the first listed symbol is used.
symbol_by_entrez = mapping_df['Approved Symbol']
symbol_by_entrez = symbol_by_entrez[symbol_by_entrez.index.notna()]
symbol_by_entrez = symbol_by_entrez[~symbol_by_entrez.index.duplicated(keep='first')]

# Replace the identifiers in both interactor columns with gene symbols.
for column in (0, 1):
    df[column] = map_entrez_to_symbol(df[column], symbol_by_entrez)

Progeres: 100%  64048 Out of 64048   

In [11]:
# Check that columns 0 and 1 now hold gene symbols instead of raw IDs.
df.head()

Unnamed: 0,0,1,2
0,DBN1,ACTG1,0.893189
1,EIF2S1,EBNA1BP2,0.088725
2,HNRNPU,HSP90AB1,0.198868
3,GIGYF1,ZNF140,0.171414
4,TRAC,AK6,0.0


In [13]:
# Keep only the two interactor columns; copy so later edits leave `df` intact.
df_ppi = df.loc[:, [0, 1]].copy()

In [14]:
# Preview the symbol pairs (score column removed).
df_ppi.head()

Unnamed: 0,0,1
0,DBN1,ACTG1
1,EIF2S1,EBNA1BP2
2,HNRNPU,HSP90AB1
3,GIGYF1,ZNF140
4,TRAC,AK6


# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [16]:
# Move the first interactor column into the index, then hand the frame to the
# project helper. Its return value is not used, so it presumably updates
# df_ppi in place — confirm in untility_functions.
df_ppi = df_ppi.set_index(0)
uf.mapgenesymbols(df_ppi)

Progeres: 99%  63830 Out of 64048   

In [17]:
# Turn the index back into an ordinary column.
df_ppi = df_ppi.reset_index()

In [18]:
# Same treatment for the second interactor column: index by it, then run the
# project helper (return value unused, so presumably an in-place update —
# confirm in untility_functions).
df_ppi = df_ppi.set_index(1)
uf.mapgenesymbols(df_ppi)

Progeres: 100%  64019 Out of 64019   

In [19]:
# Turn the index back into an ordinary column.
df_ppi = df_ppi.reset_index()

In [20]:
# (rows, columns) after both symbol-update passes.
df_ppi.shape

(64005, 2)

# Create Binary Matrix

In [21]:
# Build the matrix from the symbol pairs with the project helper.
# NOTE(review): presumably a gene-by-gene 0/1 matrix (the preview shows gene
# symbols on both axes) — confirm in untility_functions.
binary_matrix = uf.createBinaryMatix(df_ppi)

Progeres: 100%  7669 Out of 7669   

In [22]:
# Preview the first rows of the matrix.
binary_matrix.head()

Unnamed: 0,GNG2,PHYKPL,NUDCD2,PRPF31,HSP90AA5P,RPL23A,PRSS50,ASCC2,CYLD,RSRP1,...,CCDC126,THAP3,IK,CST7,DNASE1L2,PTK2B,FAM90A5P,PKN1,GALNS,XRCC1
GNG2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PHYKPL,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NUDCD2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PRPF31,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HSP90AA5P,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Save Binary Matrix

In [24]:
# Tag the output file with the current year and month (YYYY_MM).
# NOTE(review): the file name ends in '.zip' but the payload is written with
# gzip compression; confirm which format downstream readers expect.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/hu.MAP/Output/hu.map_binary_matrix_%s.tsv.zip' % date_stamp
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [28]:
# Output directory handed to uf.createUpGeneSetLib. It is resolved from the
# current user's home directory instead of a hard-coded /Users/<name> path so
# the notebook can run on other machines. This is the same directory the save
# cells address as '~/./Documents/Harmonizome/hu.MAP/Output/'.
path = os.path.expanduser('~/Documents/Harmonizome/hu.MAP/Output/')

In [29]:
# Base name passed to uf.createUpGeneSetLib; presumably used to name the
# output file — confirm in untility_functions.
name = 'hu.MAP_gene_set'

In [30]:
# Build the gene set library from the matrix with the project helper.
# NOTE(review): the return value is not captured, so the helper presumably
# writes its result under `path` using `name` — confirm in untility_functions.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  7669 Out of 7669   

# Create Gene Similarity Matrix

In [31]:
# Compute the similarity matrix with the project helper, requesting the
# 'jaccard' metric.
# NOTE(review): presumably pairwise similarity between the rows of
# binary_matrix — confirm in untility_functions.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [32]:
# Preview the first rows of the similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,GNG2,PHYKPL,NUDCD2,PRPF31,HSP90AA5P,RPL23A,PRSS50,ASCC2,CYLD,RSRP1,...,CCDC126,THAP3,IK,CST7,DNASE1L2,PTK2B,FAM90A5P,PKN1,GALNS,XRCC1
GNG2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.016393,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PHYKPL,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NUDCD2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PRPF31,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.227273,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HSP90AA5P,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Gene Similarity Matrix 

In [33]:
# Tag the output file with the current year and month (YYYY_MM).
# NOTE(review): the file name ends in '.zip' but the payload is written with
# gzip compression; confirm which format downstream readers expect.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/hu.MAP/Output/hu.map_gene_similarity_matix_%s.tsv.zip' % date_stamp
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [34]:
# Derive the gene list from the matrix with the project helper.
# NOTE(review): the preview shows one row per gene with 'GeneSym' and
# 'GeneID' columns; how IDs are looked up is defined in untility_functions.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  7669 Out of 7669   

In [35]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,GNG2,54331
1,PHYKPL,85007
2,NUDCD2,134492
3,PRPF31,26121
4,HSP90AA5P,730211


In [36]:
# (rows, columns) of the gene list.
gene_list.shape

(7669, 2)

### Save Gene List

In [37]:
# Tag the output file with the current year and month (YYYY_MM).
# The file name previously read 'hu.ma[_gene_list_...' (typo: '[' for 'p');
# it now follows the 'hu.map_' prefix used by the other output files.
# NOTE(review): the file name ends in '.zip' but the payload is written with
# gzip compression; confirm which format downstream readers expect.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/hu.MAP/Output/hu.map_gene_list_%s.tsv.zip' % date_stamp
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [38]:
# Build the edge list from the matrix and the gene list with the project
# helper.
# NOTE(review): the preview shows 'Attribute', 'Gene', 'GeneID' and 'Weight'
# columns, including zero-weight rows; presumably one row per matrix cell —
# confirm in untility_functions.
gene_attribute_edge_list = uf.createGeneAttributeEdgeList(binary_matrix, gene_list)

Progeres: 100%  7669 Out of 7669   

In [39]:
# Preview the first rows of the edge list.
gene_attribute_edge_list.head()

Unnamed: 0,Attribute,Gene,GeneID,Weight
0,GNG2,GNG2,54331,0.0
1,GNG2,PHYKPL,85007,0.0
2,GNG2,NUDCD2,134492,0.0
3,GNG2,PRPF31,26121,0.0
4,GNG2,HSP90AA5P,730211,0.0


In [40]:
# (rows, columns) of the full edge list, zero-weight rows included.
gene_attribute_edge_list.shape

(58813561, 4)

### Get Number of (Statistically Relevant) Gene-Attribute Associations

In [41]:
# Count the edges whose weight is non-zero.
has_weight = gene_attribute_edge_list['Weight'] != 0
gene_attribute_edge_list.loc[has_weight].shape

(126176, 4)

### Save Gene-Attribute Edge List

In [42]:
# Tag the output file with the current year and month (YYYY_MM).
# NOTE(review): the file name ends in '.zip' but the payload is written with
# gzip compression; confirm which format downstream readers expect.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/hu.MAP/Output/hu.map_gene_attribute_edge_list_%s.tsv.zip' % date_stamp
gene_attribute_edge_list.to_csv(filename, sep='\t', index=False, compression='gzip')