# GeneRIF

Author: Moshe Silverstein <br/>
Date: 8-17 <br/>
Data Source: ftp://ftp.ncbi.nih.gov/gene/GeneRIF/

In [1]:
import sys, datetime, os
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
%matplotlib inline

In [2]:
# Reload the helper module so that edits made to untility_functions.py after
# the initial import take effect without restarting the kernel.
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/GeneRIF/untility_functions.py'>

# Load Data

In [3]:
# Load the tab-separated interactions file. low_memory=False makes pandas
# parse the file in a single pass rather than in chunks.
df = pd.read_csv(
    'Input/interactions.gz',
    sep='\t',
    low_memory=False,
)

In [4]:
df.head()

Unnamed: 0,#tax_id,gene_id,accn.vers,name,keyphrase,tax_id,interactant_id,interactant_id_type,accn.vers.1,name.1,complex_id,complex_id_type,complex_name,pubmed_id_list,last_mod,generif_text,interaction_id,interaction_id_type
0,358,1224321,NP_059802.1,hypothetical protein pTi_130,-,-,-,-,AE008690.1,"Agrobacterium tumefaciens str. C58 Ti plasmid,...",-,-,-,15155952,2005-01-18 14:45,VirB4 interacts with T-DNA.,134324,BIND
1,358,1224322,NP_059803.1,hypothetical protein pTi_131,-,-,-,-,AE008690.1,"Agrobacterium tumefaciens str. C58 Ti plasmid,...",-,-,-,15155952,2005-01-18 14:45,VirB5 interacts with T-DNA.,134323,BIND
2,358,1224323,NP_059804.1,hypothetical protein pTi_132,-,-,-,-,AE008690.1,"Agrobacterium tumefaciens str. C58 Ti plasmid,...",-,-,-,15155952,2005-01-18 14:45,VirB6 interacts with T-DNA.,134329,BIND
3,358,1224324,NP_059805.1,hypothetical protein pTi_133,-,358,1224326,GeneID,NP_059807.1,hypothetical protein pTi_135,-,-,-,9171381,2005-05-16 15:52,VirB7 interacts with VirB9. This interaction w...,196356,BIND
4,358,1224324,NP_059805.1,hypothetical protein pTi_133,-,358,1224326,GeneID,NP_059807.1,hypothetical protein pTi_135,-,-,-,15155952,2005-01-18 14:45,VirB7 interacts with VirB9.,134325,BIND


In [5]:
df.shape

(2372536, 18)

# Get Relevant Data

In [6]:
# Keep only the records for the three taxa of interest, stacked in the order
# human (9606), mouse (10090), rat (10116).
human, mouse, rat = [
    df[df['#tax_id'] == tax_id].copy()
    for tax_id in (9606, 10090, 10116)
]

df = pd.concat([human, mouse, rat])

In [7]:
# Only the gene identifier and the interaction keyphrase are used from here on.
wanted_columns = ['gene_id', 'keyphrase']
df = df[wanted_columns]

In [10]:
# Discard rows whose keyphrase is the '-' placeholder.
has_keyphrase = df['keyphrase'] != '-'
df = df[has_keyphrase]

In [11]:
df.shape

(16829, 2)

In [12]:
df.head()

Unnamed: 0,gene_id,keyphrase
201461,2,inhibits
201462,2,downregulates
201463,2,cleaves
201614,12,upregulates
201615,12,upregulates


# Load Gene Meta Data

In [18]:
# Load the gene metadata table; index_col=1 uses the file's second column
# as the row index.
gene_meta = pd.read_csv(
    'Input/gene_meta.txt',
    sep='\t',
    index_col=1,
)

In [27]:
gene_meta.head()

Unnamed: 0_level_0,Approved Symbol
Entrez Gene ID(supplied by NCBI),Unnamed: 1_level_1
1.0,A1BG
503538.0,A1BG-AS1
29974.0,A1CF
2.0,A2M
144571.0,A2M-AS1


# Map Gene IDs to Symbols

In [31]:
# Translate the numeric gene IDs in df['gene_id'] into approved gene symbols.
# This is a single vectorized lookup; it replaces a per-row loop built on the
# `.ix` indexer, which no longer exists in pandas >= 1.0.

# Lookup table: gene_meta index (gene ID, stored as float) -> approved symbol.
symbol_by_gene_id = gene_meta['Approved Symbol']

# Index entries that are missing can never match a gene ID, and a repeated ID
# would make the lookup ambiguous, so keep the first symbol for each real ID.
symbol_by_gene_id = symbol_by_gene_id[pd.notnull(symbol_by_gene_id.index)]
symbol_by_gene_id = symbol_by_gene_id[~symbol_by_gene_id.index.duplicated(keep='first')]

# The lookup index is float-valued, so cast the IDs before mapping; IDs that
# are absent from gene_meta come back as NaN. assign() returns a new frame,
# which avoids writing a column onto the filtered slice produced earlier.
df = df.assign(gene_id=df['gene_id'].astype(float).map(symbol_by_gene_id))

# Remove rows whose gene ID has no approved symbol (or any other missing value).
df = df.dropna(how='any')

Progeres: 100%  16829 Out of 16829   

In [32]:
df.head()

Unnamed: 0,gene_id,keyphrase
201461,A2M,inhibits
201462,A2M,downregulates
201463,A2M,cleaves
201614,SERPINA3,upregulates
201615,SERPINA3,upregulates


In [33]:
df.shape

(16828, 2)

# Drop Duplicates

In [35]:
# Collapse repeated (gene, keyphrase) pairs so each association appears once.
df = df.drop_duplicates()

In [36]:
df.shape

(9452, 2)

# Create Binary Matrix

In [38]:
# Build the gene-by-keyphrase matrix from the de-duplicated (gene, keyphrase)
# pairs using the project helper.
# NOTE(review): presumably a cell is 1 when the pair is present and 0
# otherwise — confirm in untility_functions.py.
binary_matrix = uf.createBinaryMatix(df)

Progeres: 100%  4368 Out of 4368   

In [39]:
binary_matrix.head()

Unnamed: 0,required by,inhibited by,sulfated by,cooperates with,induces ubiquitination of,upregulates,recruits,localizes with,relocalized by,decreases phosphorylation of,...,myristoylated by,incorporates,affects,imported by,enhances polymerization of,restricted by,blocks,processed by,cleavage induced by,packages
EZH2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CDK5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZNF701,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PRSS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PRG4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
binary_matrix.shape

(4368, 125)

# Save Binary Matrix

In [41]:
# Tag the output file with the current year and month (YYYY_MM).
month_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GeneRIF/Output/generif_binary_matrix_%s.tsv.zip' % month_stamp
# NOTE(review): the file name ends in ".zip" but the data is written with
# gzip compression — confirm which one downstream consumers expect.
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [42]:
# Directory handed to the uf helper as its `path` argument.
# NOTE(review): absolute path on the author's machine — adjust before running elsewhere.
path = '/Users/moshesilverstein/Documents/Harmonizome/GeneRIF/Output/'

In [43]:
# Base name handed to the uf helper as its `name` argument.
name = 'generif_gene_set'

In [44]:
# Delegate to the project helper with the matrix, output directory and base name.
# NOTE(review): presumably writes a gene-set library file named from `name`
# into `path` — confirm in untility_functions.py.
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  125 Out of 125   

# Create Attribute Library

In [45]:
# Directory handed to the uf helper as its `path` argument.
# NOTE(review): absolute path on the author's machine — adjust before running elsewhere.
path = '/Users/moshesilverstein/Documents/Harmonizome/GeneRIF/Output/'

In [46]:
# Base name handed to the uf helper as its `name` argument.
name = 'generif_attribute_set'

In [47]:
# Delegate to the project helper with the matrix, output directory and base name.
# NOTE(review): presumably writes an attribute-set library file named from
# `name` into `path` — confirm in untility_functions.py.
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  4368 Out of 4368   

# Create Gene Similarity Matrix

In [48]:
# Compute the gene-by-gene similarity matrix from the rows of binary_matrix,
# passing 'jaccard' as the metric name to the project helper.
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [49]:
gene_similarity_matix.head()

Unnamed: 0,EZH2,CDK5,ZNF701,PRSS1,PRG4,TIMM8A,SEPSECS,KRTCAP3,ERCC3,WDR5,...,FUT1,GLUD2,ANXA1,RARA,MYO1G,GNS,MS4A6A,USP26,CCND2,MAVS
EZH2,1.0,0.0,1.0,0.0,0.0,0.5,0.0,1.0,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
CDK5,0.0,1.0,0.0,0.0,0.0,0.166667,0.2,0.0,0.090909,0.2,...,0.0,0.2,0.2,0.0,0.2,0.0,0.2,0.0,0.0,0.0
ZNF701,1.0,0.0,1.0,0.0,0.0,0.5,0.0,1.0,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
PRSS1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PRG4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.142857,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Gene Similarity Matrix

In [50]:
# Tag the output file with the current year and month (YYYY_MM).
month_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GeneRIF/Output/generif_gene_similarity_matix_%s.tsv.zip' % month_stamp
# NOTE(review): the file name ends in ".zip" but the data is written with
# gzip compression — confirm which one downstream consumers expect.
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [51]:
# Same helper as for genes, applied to the transposed matrix so that the
# attributes (keyphrases) become the rows being compared.
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [52]:
attribute_similarity_matix.head()

Unnamed: 0,required by,inhibited by,sulfated by,cooperates with,induces ubiquitination of,upregulates,recruits,localizes with,relocalized by,decreases phosphorylation of,...,myristoylated by,incorporates,affects,imported by,enhances polymerization of,restricted by,blocks,processed by,cleavage induced by,packages
required by,1.0,0.00207,0.0,0.024194,0.0,0.000939,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
inhibited by,0.00207,1.0,0.0,0.056042,0.010373,0.113357,0.032381,0.0,0.006148,0.010081,...,0.0,0.044326,0.006211,0.010121,0.0,0.0,0.004158,0.003676,0.002083,0.002083
sulfated by,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cooperates with,0.024194,0.056042,0.0,1.0,0.007752,0.05427,0.063218,0.0,0.0,0.035971,...,0.0,0.008696,0.007812,0.007092,0.0,0.0,0.0,0.0,0.00813,0.0
induces ubiquitination of,0.0,0.010373,0.0,0.007752,1.0,0.001874,0.014706,0.0,0.0,0.0,...,0.0,0.026549,0.083333,0.0,0.0,0.0,0.0,0.0,0.0,0.142857


# Save Attribute Similarity Matrix

In [53]:
# Tag the output file with the current year and month (YYYY_MM).
month_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GeneRIF/Output/generif_attribute_similarity_matix_%s.tsv.zip' % month_stamp
# NOTE(review): the file name ends in ".zip" but the data is written with
# gzip compression — confirm which one downstream consumers expect.
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [61]:
# Build the table of genes present in binary_matrix using the project helper.
# NOTE(review): how the helper resolves each symbol to its GeneID is not
# visible here — confirm in untility_functions.py.
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  4368 Out of 4368   

In [62]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,EZH2,2146
1,CDK5,1020
2,ZNF701,55762
3,PRSS1,5644
4,PRG4,10216


In [63]:
gene_list.shape

(4368, 2)

# Save Gene List

In [64]:
# Tag the output file with the current year and month (YYYY_MM).
month_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GeneRIF/Output/generif_gene_list_%s.tsv.zip' % month_stamp
# NOTE(review): the file name ends in ".zip" but the data is written with
# gzip compression — confirm which one downstream consumers expect.
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List

In [65]:
# Build the table of attributes (keyphrases) from binary_matrix using the
# project helper.
attribute_list = uf.createAttributeList(binary_matrix)

In [66]:
attribute_list.head()

Unnamed: 0,Attributes
0,required by
1,inhibited by
2,sulfated by
3,cooperates with
4,induces ubiquitination of


In [67]:
attribute_list.shape

(125, 1)

# Save Attribute List

In [68]:
# Tag the output file with the current year and month (YYYY_MM).
month_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GeneRIF/Output/generif_attribute_list_%s.tsv.zip' % month_stamp
# NOTE(review): the file name ends in ".zip" but the data is written with
# gzip compression — confirm which one downstream consumers expect.
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [69]:
# Directory handed to the uf helper as its `path` argument.
# NOTE(review): absolute path on the author's machine — adjust before running elsewhere.
path = '/Users/moshesilverstein/Documents/Harmonizome/GeneRIF/Output/'

In [70]:
# Base name handed to the uf helper as its `name` argument.
name = 'generif_gene_attribute_edge_list'

In [71]:
# Delegate to the project helper with the matrix, the gene table, the output
# directory and the base name.
# NOTE(review): presumably writes one edge per gene-attribute association to a
# file named from `name` in `path` — confirm in untility_functions.py.
uf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  125 Out of 125   

 The number of statisticaly relevent gene-attribute associations is: 9452
