 # GeneRIF

 Author: Moshe Silverstein <br/>
 Date: 8-17 <br/>
 Data Source: ftp://ftp.ncbi.nih.gov/gene/GeneRIF/

 Reviewer: Charles Dai <br>
 Updated: 6-20

In [4]:
# appyter init
# Set up appyter's notebook magics; the %%appyter cells further down depend on this.
# The lambda, when called, returns this notebook's globals() dict -- presumably so
# that appyter can execute rendered template code in this namespace (confirm in
# the appyter documentation).
from appyter import magic
magic.init(lambda _=globals: _())

In [5]:
import sys
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [6]:
# from clustergrammer_widget import *
# net = Network(clustergrammer_widget)

In [7]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to local helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

 ### Python Version

In [8]:
# Record the interpreter version this notebook was executed with.
sys.version

'3.6.9 (default, Apr 18 2020, 01:56:04) \n[GCC 8.4.0]'

 # Initialization

 ### Load Mapping Dictionaries

In [9]:
# Build the two gene lookup tables used by the rest of the notebook:
#   symbol_lookup - passed to uf.mapgenesymbols to bring gene symbols up to date
#   geneid_lookup - maps gene symbol -> gene ID; it is inverted later to turn
#                   the gene IDs in the data into symbols
# NOTE(review): the helper appears to gather several sources and may need
# network access -- confirm in harmonizome.lookup.
symbol_lookup, geneid_lookup = lookup.get_lookups()

Gathering sources: 100%|██████████| 3/3 [00:07<00:00,  2.46s/it]


 ### Output Path

In [10]:
# Prefix shared by every output file this notebook writes.
output_name = 'generif'

# Directory that receives all output files. exist_ok=True keeps this cell safe
# to re-run and avoids the check-then-create race of os.path.exists followed
# by os.makedirs.
path = 'Output/GeneRIF'
os.makedirs(path, exist_ok=True)

In [11]:
%%appyter hide_code
# Declares the 'data' section of the appyter input form. The file field in the
# load-data cell refers to it through section='data'.
{% do SectionField(
    name='data',
    title='Load Data',
    subtitle='Upload Files from the GeneRIF Data Set',
) %}

 # Load Data

In [78]:
%%appyter code_exec

# Read the tab-separated interactions file selected in the form (default:
# Input/GeneRIF/interactions.gz), keeping only the four columns used below.
# NOTE(review): 'name.1' is presumably pandas' de-duplicated label for a second
# 'name' column in the file header -- confirm against the input file.
df = pd.read_csv({{FileField(
    constraint='.*\.gz$',
    name='interactions', 
    label='Interactions (gz)', 
    default='Input/GeneRIF/interactions.gz',
    section='data')
}}, sep='\t', usecols=['#tax_id', 'gene_id', 'keyphrase', 'name.1'])

```python

df = pd.read_csv('Input/GeneRIF/interactions.gz', sep='\t', usecols=['#tax_id', 'gene_id', 'keyphrase', 'name.1'])
```

In [79]:
# Preview the first rows of the raw table.
df.head()

Unnamed: 0,#tax_id,gene_id,keyphrase,name.1
0,358,1224321,-,"Agrobacterium tumefaciens str. C58 Ti plasmid,..."
1,358,1224322,-,"Agrobacterium tumefaciens str. C58 Ti plasmid,..."
2,358,1224323,-,"Agrobacterium tumefaciens str. C58 Ti plasmid,..."
3,358,1224324,-,hypothetical protein pTi_135
4,358,1224324,-,hypothetical protein pTi_135


In [80]:
# (rows, columns) of the raw table, before any filtering.
df.shape

(2984183, 4)

 # Pre-process Data

 ## Get Relevant Data

In [81]:
# Only relevant species: keep rows whose taxonomy ID is one of the three
# organisms below, then drop the taxonomy column, which is not used afterwards.
df = df[df['#tax_id'].isin([
    9606,   # human
    10090,  # mouse
    10116,  # rat
])].drop(columns='#tax_id')

In [82]:
# Concatenate full attribute:
#   1. discard rows whose keyphrase is the placeholder '-'
#   2. build the attribute label as "<keyphrase> <name.1>"
#   3. drop the now-redundant 'name.1' column and index the rows by gene ID
df = (
    df.loc[df['keyphrase'] != '-']
      .assign(keyphrase=lambda rows: rows['keyphrase'] + ' ' + rows['name.1'])
      .drop(columns='name.1')
      .set_index('gene_id')
)
df.head()

Unnamed: 0_level_0,keyphrase
gene_id,Unnamed: 1_level_1
2,inhibits Tat
2,downregulates Envelope transmembrane glycoprot...
2,cleaves retropepsin
12,upregulates Tat
12,upregulates Envelope surface glycoprotein gp120


 ## Map Gene ID to Symbol

In [83]:
# geneid_lookup maps symbol -> gene ID; invert it so the gene IDs in the index
# can be translated to symbols. If several symbols share one ID, the last one
# encountered wins. IDs absent from the lookup end up as missing index values.
id_to_symbol = {
    gene_id: symbol
    for symbol, gene_id in geneid_lookup.items()
}
df.index = df.index.map(id_to_symbol)
df.head()

Unnamed: 0_level_0,keyphrase
gene_id,Unnamed: 1_level_1
A2M,inhibits Tat
A2M,downregulates Envelope transmembrane glycoprot...
A2M,cleaves retropepsin
SERPINA3,upregulates Tat
SERPINA3,upregulates Envelope surface glycoprotein gp120


 # Filter Data

 ## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [84]:
# Replace gene symbols with their up-to-date approved symbols using symbol_lookup.
# NOTE(review): rows whose symbol cannot be mapped are presumably dropped (the
# recorded run ends with slightly fewer rows than it processed) -- confirm in
# uf.mapgenesymbols.
df = uf.mapgenesymbols(df, symbol_lookup)
df.shape

100%|██████████| 17772/17772 [00:00<00:00, 532170.85it/s]


(17764, 1)

 # Analyze Data

 ## Create Binary Matrix

In [85]:
# Turn the (gene, keyphrase) rows into a gene x attribute boolean matrix:
# one row per gene symbol, one column per distinct keyphrase.
binary_matrix = uf.createBinaryMatrix(df)
binary_matrix.head()

Unnamed: 0,Binds capsid,DOES NOT bind mutant capsid,DOES NOT bind mutant nucleocapsid,Inhibits Nef,abrogated by Vpr,abrogates Vpr,acetylated by Tat,acetylated by integrase,acetylates Tat,activated by Envelope surface glycoprotein gp120,"activated by Human immunodeficiency virus 1, complete genome",activated by Tat,activated by Vpr,activated by capsid,activated by integrase,activated by retropepsin,activates Envelope surface glycoprotein gp120,"activates Envelope surface glycoprotein gp160, precursor",activates Envelope transmembrane glycoprotein gp41,"activates Human immunodeficiency virus 1, complete genome",activates Nef,activates Pr55(Gag),activates Rev,activates Tat,activates Vpr,activates Vpu,activates matrix,activates nucleocapsid,activates retropepsin,activates reverse transcriptase,"affected by Human immunodeficiency virus 1, complete genome",affected by Nef,affected by Pr55(Gag),affected by Tat,affects Pr55(Gag),affects Vpr,affects capsid,antagonizes Envelope transmembrane glycoprotein gp41,antagonizes Nef,antagonizes Vif,...,suppresses Vpu,synergizes with Nef,synergizes with Rev,synergizes with Tat,targets Vpr,ubiquitinated by Pr55(Gag),ubiquitinated by Rev,ubiquitinated by Tat,ubiquitinated by capsid,ubiquitinated by matrix,ubiquitinated by nucleocapsid,ubiquitinated by p1,ubiquitinated by p6,ubiquitinates Vpu,upregulated by Asp,upregulated by Envelope surface glycoprotein gp120,"upregulated by Envelope surface glycoprotein gp160, precursor",upregulated by Nef,upregulated by Pr55(Gag),upregulated by Rev,upregulated by Tat,upregulated by capsid,upregulates Asp,upregulates Envelope surface glycoprotein gp120,"upregulates Envelope surface glycoprotein gp160, precursor",upregulates Envelope transmembrane glycoprotein gp41,upregulates Gag-Pol,"upregulates Human immunodeficiency virus 1, complete genome",upregulates Nef,upregulates Pr55(Gag),upregulates Tat,upregulates Vif,upregulates Vpr,upregulates Vpu,upregulates capsid,upregulates integrase,upregulates 
matrix,upregulates nucleocapsid,upregulates reverse transcriptase,utilizes Envelope surface glycoprotein gp120
A2M,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
A4GALT,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
AARSD1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
AASS,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
AATK,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [86]:
# (number of genes, number of attributes)
binary_matrix.shape

(4457, 661)

In [87]:
# Write the binary matrix to the output directory, requesting npz compression
# and an unsigned 8-bit dtype for the stored values.
uf.saveData(
    binary_matrix,
    path,
    f'{output_name}_binary_matrix',
    compression='npz',
    dtype=np.uint8,
)

 ## Create Gene List

In [88]:
# Build the gene table for the genes in the binary matrix: one row per gene,
# with its symbol (GeneSym) and the gene ID taken from geneid_lookup (GeneID).
gene_list = uf.createGeneList(binary_matrix, geneid_lookup)
gene_list.head()

100%|██████████| 4457/4457 [00:00<00:00, 227989.67it/s]


Unnamed: 0,GeneSym,GeneID
0,A2M,2
1,A4GALT,53947
2,AARSD1,80755
3,AASS,10157
4,AATK,9625


In [89]:
# (number of genes, 2): the row count should match binary_matrix's row count.
gene_list.shape

(4457, 2)

In [90]:
# Write the gene list as a gzip-compressed TSV, without the row index.
uf.saveData(
    gene_list,
    path,
    f'{output_name}_gene_list',
    ext='tsv',
    compression='gzip',
    index=False,
)

 ## Create Attribute List

In [91]:
# Build the attribute table from the binary matrix's columns. The attribute
# names are held in the index; the frame itself has no data columns.
attribute_list = uf.createAttributeList(binary_matrix)
attribute_list.head()

Binds capsid
DOES NOT bind mutant capsid
DOES NOT bind mutant nucleocapsid
Inhibits Nef
abrogated by Vpr


In [92]:
# (number of attributes, 0): zero columns because the names live in the index.
attribute_list.shape

(661, 0)

In [93]:
# Write the attribute list as a gzip-compressed TSV.
uf.saveData(
    attribute_list,
    path,
    f'{output_name}_attribute_list',
    ext='tsv',
    compression='gzip',
)

 ## Create Gene and Attribute Set Libraries

In [94]:
# Write the gene set library derived from the binary matrix.
# NOTE(review): presumably one gene set per attribute -- confirm in
# uf.createUpGeneSetLib, including any filtering it applies.
uf.createUpGeneSetLib(
    binary_matrix,
    path,
    f'{output_name}_gene_up_set',
)

100%|██████████| 264/264 [00:00<00:00, 12892.03it/s]


In [95]:
# Write the attribute set library derived from the binary matrix.
# NOTE(review): presumably one attribute set per gene -- confirm in
# uf.createUpAttributeSetLib.
uf.createUpAttributeSetLib(
    binary_matrix,
    path,
    f'{output_name}_attribute_up_set',
)

100%|██████████| 4457/4457 [00:00<00:00, 22384.37it/s]


 ## Create Attribute Similarity Matrix

In [96]:
# Pairwise Jaccard similarity between attributes. The binary matrix is
# transposed so that attributes become the rows being compared.
# NOTE(review): sparse=True presumably selects a sparse-matrix code path in the
# helper -- confirm in uf.createSimilarityMatrix.
attribute_similarity_matrix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

Unnamed: 0,Binds capsid,DOES NOT bind mutant capsid,DOES NOT bind mutant nucleocapsid,Inhibits Nef,abrogated by Vpr,abrogates Vpr,acetylated by Tat,acetylated by integrase,acetylates Tat,activated by Envelope surface glycoprotein gp120,"activated by Human immunodeficiency virus 1, complete genome",activated by Tat,activated by Vpr,activated by capsid,activated by integrase,activated by retropepsin,activates Envelope surface glycoprotein gp120,"activates Envelope surface glycoprotein gp160, precursor",activates Envelope transmembrane glycoprotein gp41,"activates Human immunodeficiency virus 1, complete genome",activates Nef,activates Pr55(Gag),activates Rev,activates Tat,activates Vpr,activates Vpu,activates matrix,activates nucleocapsid,activates retropepsin,activates reverse transcriptase,"affected by Human immunodeficiency virus 1, complete genome",affected by Nef,affected by Pr55(Gag),affected by Tat,affects Pr55(Gag),affects Vpr,affects capsid,antagonizes Envelope transmembrane glycoprotein gp41,antagonizes Nef,antagonizes Vif,...,suppresses Vpu,synergizes with Nef,synergizes with Rev,synergizes with Tat,targets Vpr,ubiquitinated by Pr55(Gag),ubiquitinated by Rev,ubiquitinated by Tat,ubiquitinated by capsid,ubiquitinated by matrix,ubiquitinated by nucleocapsid,ubiquitinated by p1,ubiquitinated by p6,ubiquitinates Vpu,upregulated by Asp,upregulated by Envelope surface glycoprotein gp120,"upregulated by Envelope surface glycoprotein gp160, precursor",upregulated by Nef,upregulated by Pr55(Gag),upregulated by Rev,upregulated by Tat,upregulated by capsid,upregulates Asp,upregulates Envelope surface glycoprotein gp120,"upregulates Envelope surface glycoprotein gp160, precursor",upregulates Envelope transmembrane glycoprotein gp41,upregulates Gag-Pol,"upregulates Human immunodeficiency virus 1, complete genome",upregulates Nef,upregulates Pr55(Gag),upregulates Tat,upregulates Vif,upregulates Vpr,upregulates Vpu,upregulates capsid,upregulates integrase,upregulates 
matrix,upregulates nucleocapsid,upregulates reverse transcriptase,utilizes Envelope surface glycoprotein gp120
Binds capsid,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003106,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,0.00495,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DOES NOT bind mutant capsid,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003106,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,0.00495,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DOES NOT bind mutant nucleocapsid,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003106,0.0,0.0,0.0,0.021277,0.0,0.0,0.0,0.0,0.00495,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Inhibits Nef,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016949,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
abrogated by Vpr,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.071429,0.0,0.0,0.0,0.0,0.0,0.0,0.003106,0.0,0.010101,0.0,0.0,0.0,0.021277,0.001701,0.0,0.00495,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [97]:
# Write the attribute similarity matrix with npz compression as float32,
# flagging it as symmetric for the save helper.
uf.saveData(
    attribute_similarity_matrix,
    path,
    f'{output_name}_attribute_similarity_matrix',
    compression='npz',
    symmetric=True,
    dtype=np.float32,
)

In [98]:
# net.load_df(attribute_similarity_matrix.iloc[:,:].copy())
# net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

 ## Create Gene Similarity Matrix

In [99]:
# Pairwise Jaccard similarity between genes (the rows of the binary matrix).
# NOTE(review): sparse=True presumably selects a sparse-matrix code path in the
# helper -- confirm in uf.createSimilarityMatrix.
gene_similarity_matrix = uf.createSimilarityMatrix(binary_matrix, 'jaccard', sparse=True)
gene_similarity_matrix.head()

Unnamed: 0,A2M,A4GALT,AARSD1,AASS,AATK,ABCA1,ABCA5,ABCB1,ABCB4,ABCC1,ABCD3,ABCE1,ABCF1,ABCG2,ABHD16A,ABHD16B,ABL1,ABO,ABR,ABTB1,ABTB2,ACAA2,ACACA,ACACB,ACADSB,ACAN,ACAP1,ACBD3,ACBD5,ACE,ACHE,ACIN1,ACKR2,ACKR4,ACLY,ACOD1,ACOT8,ACP2,ACP5,ACSL3,...,ZNF202,ZNF205,ZNF212,ZNF254,ZNF26,ZNF277,ZNF292,ZNF333,ZNF354A,ZNF395,ZNF430,ZNF436,ZNF480,ZNF512,ZNF512B,ZNF516,ZNF518A,ZNF536,ZNF552,ZNF556,ZNF587,ZNF594,ZNF598,ZNF639,ZNF658,ZNF687,ZNF688,ZNF701,ZNF720,ZNF747,ZNF761,ZNF785,ZNF791,ZNF831,ZNF93,ZNRD2,ZNRF2,ZSCAN16,ZSWIM4,ZYX
A2M,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.333333,0.0,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A4GALT,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.25,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.5,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.5
AARSD1,0.0,0.0,1.0,0.0,1.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5
AASS,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.25,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.5,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.5
AATK,0.0,0.0,1.0,0.0,1.0,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5


In [100]:
# Write the gene similarity matrix with npz compression as float32,
# flagging it as symmetric for the save helper.
uf.saveData(
    gene_similarity_matrix,
    path,
    f'{output_name}_gene_similarity_matrix',
    compression='npz',
    symmetric=True,
    dtype=np.float32,
)

 ## Create Gene-Attribute Edge List

In [101]:
# Write the gene-attribute edge list built from the binary matrix and the two
# lookup tables; the helper reports how many associations it found.
uf.createGeneAttributeEdgeList(
    binary_matrix,
    attribute_list,
    gene_list,
    path,
    f'{output_name}_gene_attribute_edge_list',
)

The number of statisticaly relevent gene-attribute associations is: 13374


 # Create Downloadable Save File

In [102]:
# Bundle the files in the output directory into a downloadable archive.
# NOTE(review): the download link below expects ./output_archive.zip -- confirm
# that uf.createArchive writes to that location.
uf.createArchive(path)

 ### Link to download output files: [click here](./output_archive.zip)