 # Gene Ontology (GO)

 Author: Moshe Silverstein <br/>
 Date: 03-2018 <br/>
 Data Source: http://geneontology.org/docs/downloads/

 Reviewer: Charles Dai <br>
 Updated: 6-20

In [1]:
# appyter init
from appyter import magic
magic.init(lambda _=globals: _())

In [2]:
import sys
import os

import numpy as np
import pandas as pd
import itertools
import xml.etree.ElementTree as ET
import matplotlib.pyplot as plt
%matplotlib inline

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [3]:
# from clustergrammer_widget import *
# net = Network(clustergrammer_widget)

In [4]:
%load_ext autoreload
%autoreload 2

 ### Python Version

In [5]:
sys.version

'3.8.0 (default, Oct 28 2019, 16:14:01) \n[GCC 8.3.0]'

 # Initialization

 ### Options

In [6]:
%%appyter code_eval

{% set attribute = ChoiceField(
    name='attribute',
    label='Attribute',
    choices={
        'Biological Process': "'Process'",
        'Cellular Component': "'Component'",
        'Molecular Function': "'Function'"
    },
    default='Biological Process',
    section='data'
) %}

 ### Load Mapping Dictionaries

In [7]:
symbol_lookup, geneid_lookup = lookup.get_lookups()

Gathering sources: 100%|██████████| 3/3 [00:09<00:00,  3.02s/it]


 ### Output Path

In [8]:
%%appyter code_exec

output_name = 'go_' + {{attribute}}.lower()

path = 'Output/GO-' + {{attribute}}
if not os.path.exists(path):
    os.makedirs(path)

```python
output_name = 'go_' + 'Process'.lower()
path = 'Output/GO-' + 'Process'
if not os.path.exists(path):
    os.makedirs(path)
```

In [9]:
%%appyter hide_code
{% do SectionField(
    name='data',
    title='Load Data',
    subtitle='Upload Files from the Gene Ontology'
) %}

 # Load Data

In [10]:
%%appyter code_exec

df = pd.read_csv({{FileField(
    constraint='.*\.gaf$',
    name='gaf', 
    label='Gene Assocation File (gaf)', 
    default='Input/GO/goa_human.gaf',
    section='data')
}}, skiprows=31, header=None, usecols=[2, 3, 4, 6, 8], sep='\t',
    names=['DB Object Symbol', 'Qualifier', 'GO ID', 'Evidence Code', 'Aspect'])

```python
df = pd.read_csv('Input/GO/goa_human.gaf', skiprows=31, header=None, usecols=[2, 3, 4, 6, 8], sep='\t',
    names=['DB Object Symbol', 'Qualifier', 'GO ID', 'Evidence Code', 'Aspect'])
```

In [11]:
df.head()

Unnamed: 0,DB Object Symbol,Qualifier,GO ID,Evidence Code,Aspect
0,NUDT4B,,GO:0003723,IEA,F
1,NUDT4B,,GO:0005829,IDA,C
2,NUDT4B,,GO:0008486,IEA,F
3,NUDT4B,,GO:0046872,IEA,F
4,NUDT4B,,GO:0052840,IEA,F


In [12]:
df.shape

(502842, 5)

 ## Load Ontology

In [13]:
%%appyter code_exec

tree = ET.parse({{FileField(
    constraint='.*\.owl$',
    name='ontology', 
    label='Ontology (owl)', 
    default='Input/GO/go.owl',
    section='data')
}})
root = tree.getroot()

```python
tree = ET.parse('Input/GO/go.owl')
root = tree.getroot()
```

 # Pre-process Data

 ## Get Relevant Data

In [14]:
%%appyter code_exec

# Get only desired attribute:
# P for Biological Process, C for Cellular Component, F for Molecular Function
df = df[df['Aspect'] == {{attribute}}[0]]
# Drop data inferred from electronic annotation
df = df[df['Evidence Code'] != 'IEA']
# Drop NOT in qualifier
df = df[df['Qualifier'] != 'NOT']

```python
# Get only desired attribute:
# P for Biological Process, C for Cellular Component, F for Molecular Function
df = df[df['Aspect'] == 'Process'[0]]
# Drop data inferred from electronic annotation
df = df[df['Evidence Code'] != 'IEA']
# Drop NOT in qualifier
df = df[df['Qualifier'] != 'NOT']
```

In [15]:
df.head()

Unnamed: 0,DB Object Symbol,Qualifier,GO ID,Evidence Code,Aspect
115,IGKV2-28,,GO:0006898,TAS,P
116,IGKV2-28,,GO:0006956,TAS,P
117,IGKV2-28,,GO:0006958,TAS,P
119,IGKV2-28,,GO:0030449,TAS,P
120,IGKV2-28,,GO:0038095,TAS,P


In [16]:
df = df[['DB Object Symbol', 'GO ID']]
df.shape

(122255, 2)

 ## Build GO Ontology Map

In [17]:
ns = {
    'owl': 'http://www.w3.org/2002/07/owl#',
    'obo': 'http://www.geneontology.org/formats/oboInOwl#',
    'rdfs': 'http://www.w3.org/2000/01/rdf-schema#'
}

In [18]:
mapping = {'GO ID': [], 'Label': []}

for child in root.findall('owl:Class', ns):
    id = child.find('obo:id', ns)
    label = child.find('rdfs:label', ns)
    if id is not None and label is not None:
        mapping['GO ID'].append(id.text)
        mapping['Label'].append(label.text)

onto_meta = pd.DataFrame(mapping).set_index('GO ID')
onto_meta.head()

Unnamed: 0_level_0,Label
GO ID,Unnamed: 1_level_1
GO:0000001,mitochondrion inheritance
GO:0000002,mitochondrial genome maintenance
GO:0000003,reproduction
GO:0000005,obsolete ribosomal chaperone activity
GO:0000006,high-affinity zinc transmembrane transporter a...


 ## Map GO IDs

In [19]:
df['GO ID'] = onto_meta['Label'].reindex(df['GO ID']).reset_index(drop=True)
df = df.set_index('DB Object Symbol')
df.head()

Unnamed: 0_level_0,GO ID
DB Object Symbol,Unnamed: 1_level_1
IGKV2-28,positive regulation of protein phosphorylation
IGKV2-28,positive regulation of autophagy
IGKV2-28,positive regulation of peptidyl-threonine phos...
IGKV2-28,positive regulation of peptidyl-serine phospho...
IGKV2-28,defense response to bacterium


 # Filter Data

 ## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [20]:
df = uf.mapgenesymbols(df, symbol_lookup)
df.shape

100%|██████████| 122255/122255 [00:00<00:00, 631781.40it/s]


(121803, 1)

 # Analyze Data

 ## Create Binary Matrix

In [None]:
binary_matrix = uf.createBinaryMatrix(df)
binary_matrix.head()

In [None]:
binary_matrix.shape

In [None]:
uf.saveData(binary_matrix, path, output_name + '_binary_matrix', 
            compression='npz', dtype=np.uint8)

 ## Create Gene List

In [None]:
gene_list = uf.createGeneList(binary_matrix, geneid_lookup)
gene_list.head()

In [None]:
gene_list.shape

In [None]:
uf.saveData(gene_list, path, output_name + '_gene_list',
            ext='tsv', compression='gzip', index=False)

 ## Create Attribute List

In [None]:
attribute_list = uf.createAttributeList(binary_matrix)
attribute_list.head()

In [None]:
attribute_list.shape

In [None]:
uf.saveData(attribute_list, path, output_name + '_attribute_list',
            ext='tsv', compression='gzip')

 ## Create Gene and Attribute Set Libraries

In [None]:
uf.createUpGeneSetLib(binary_matrix, path, output_name + '_gene_up_set')

In [None]:
uf.createUpAttributeSetLib(binary_matrix, path, 
                           output_name + '_attribute_up_set')

 ## Create Attribute Similarity Matrix

In [None]:
attribute_similarity_matrix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
uf.saveData(attribute_similarity_matrix, path,
            output_name + '_attribute_similarity_matrix', 
            compression='npz', symmetric=True, dtype=np.float32)

In [None]:
# net.load_df(attribute_similarity_matrix.iloc[:,:].copy())
# net.filter_N_top('row', rank_type='sum', N_top=300)
# net.cluster()
# net.widget()

 ## Create Gene Similarity Matrix

In [None]:
gene_similarity_matrix = uf.createSimilarityMatrix(binary_matrix, 'jaccard', sparse=True)
gene_similarity_matrix.head()

In [None]:
uf.saveData(gene_similarity_matrix, path, 
            output_name + '_gene_similarity_matrix',
            compression='npz', symmetric=True, dtype=np.float32)

 ## Create Gene-Attribute Edge List

In [None]:
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, 
                               path, output_name + '_gene_attribute_edge_list')

 # Create Downloadable Save File

In [None]:
uf.createArchive(path)

 ### Link to download output files: [click here](./output_archive.zip)