# Harmonizome ETL: Gene Ontology (GO)

Created by: Charles Dai <br>
Credit to: Moshe Silverstein

Data Source: http://geneontology.org/docs/download-ontology/

In [None]:
# appyter init
# Hand appyter a callable that returns this notebook's global namespace
# (presumably so its cell magics can work with variables defined here --
# confirm against the appyter documentation).
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import gzip
import itertools
import os
import sys
import xml.etree.ElementTree as ET
from datetime import date

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to imported modules are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

### Notebook Information

In [None]:
# Record the run date and interpreter version for provenance.
print(f'This notebook was run on: {date.today()} \nPython version: {sys.version}')

# Initialization

In [None]:
%%appyter hide_code

# Declare the two sections of the appyter input form. The fields defined in
# the following cells attach to them through their `section` argument
# ('data' for the uploads, 'settings' for the attribute choice).
{% do SectionField(
    name='data',
    title='Upload Data',
    img='load_icon.png'
) %}

{% do SectionField(
    name='settings',
    title='Settings',
    img='setting_icon.png'
) %}

In [None]:
%%appyter code_eval

{% do DescriptionField(
    name='description',
    text='The examples below were sourced from <a href="http://geneontology.org/" target="_blank">geneontology.org</a>. If clicking on the examples does not work, they should be downloaded directly from the source website.',
    section='data'
) %}

{% set df_file = FileField(
    constraint='.*\.gaf.gz$',
    name='gaf', 
    label='Gene Assocation File (gaf.gz)', 
    default='goa_human.gaf.gz',
    examples={
        'goa_human.gaf.gz': 'http://geneontology.org/gene-associations/goa_human.gaf.gz'
    },
    section='data'
) %}

{% set ontology = FileField(
    constraint='.*\.owl$',
    name='ontology', 
    label='Ontology (owl)', 
    default='go.owl',
    examples={
        'go.owl': 'http://purl.obolibrary.org/obo/go.owl'
    },
    section='data'
) %}

In [None]:
%%appyter code_eval

# Choose which GO aspect this run extracts. Later cells use the selected value
# to name the outputs and take its first letter to filter the GAF 'Aspect' column.
{% set attribute = ChoiceField(
    name='attribute',
    label='Attribute',
    choices={
        'Biological Process': 'Process',
        'Cellular Component': 'Component',
        'Molecular Function': 'Function'
    },
    default='Cellular Component',
    section='settings'
) %}

### Load Mapping Dictionaries

In [None]:
# Fetch the two lookup tables used further down: symbol_lookup is passed to
# uf.map_symbols and geneid_lookup to uf.gene_list.
symbol_lookup, geneid_lookup = lookup.get_lookups()

### Output Path

In [None]:
%%appyter code_exec

output_name = 'go_{{attribute}}'.lower()

path = 'Output/GO-{{attribute}}' 
if not os.path.exists(path):
    os.makedirs(path)

# Load Data

In [None]:
%%appyter code_exec

df = pd.read_csv(
    {{df_file}}, 
    skiprows=31, header=None, usecols=[2, 3, 4, 6, 8], sep='\t',
    names=['DB Object Symbol', 'Qualifier', 'GO ID', 'Evidence Code', 'Aspect'])

In [None]:
# Preview the first rows of the annotation table as loaded.
df.head()

In [None]:
# (rows, columns) of the annotation table before any filtering.
df.shape

## Load Ontology

In [None]:
%%appyter code_exec

# Parse the uploaded ontology file as XML and keep its root element; the
# ontology-map cell searches that root for owl:Class entries.
tree = ET.parse({{ontology}})
root = tree.getroot()

# Pre-process Data

## Get Relevant Data

In [None]:
%%appyter code_exec

# Get only desired attribute:
# P for Biological Process, C for Cellular Component, F for Molecular Function
df = df[df['Aspect'] == '{{attribute}}'[0]]
# Drop data inferred from electronic annotation
df = df[df['Evidence Code'] != 'IEA']
# Drop NOT in qualifier
df = df[df['Qualifier'] != 'NOT']

In [None]:
# (rows, columns) remaining after the aspect, evidence-code and qualifier filters.
df.shape

In [None]:
# Only the gene symbol and the GO identifier are needed from here on.
df = df.loc[:, ['DB Object Symbol', 'GO ID']]
df.head()

## Build GO Ontology Map

In [None]:
# XML namespace prefixes used when querying the ontology tree.
ns = dict(
    owl='http://www.w3.org/2002/07/owl#',
    obo='http://www.geneontology.org/formats/oboInOwl#',
    rdfs='http://www.w3.org/2000/01/rdf-schema#',
)

In [None]:
# Collect (GO ID, label) pairs from every owl:Class directly under the root.
mapping = {'GO ID': [], 'Label': []}

for owl_class in root.findall('owl:Class', ns):
    # Named go_id rather than id so the builtin id() is not shadowed in the
    # notebook's global namespace.
    go_id = owl_class.find('obo:id', ns)
    label = owl_class.find('rdfs:label', ns)
    # Classes missing either element cannot be mapped and are skipped.
    if go_id is not None and label is not None:
        mapping['GO ID'].append(go_id.text)
        mapping['Label'].append(label.text)

# Lookup table from GO ID to its human-readable label.
onto_meta = pd.DataFrame(mapping).set_index('GO ID')
onto_meta.head()

## Map GO IDs

In [None]:
# Translate each GO ID to its ontology label. Series.map returns a result that
# carries df's own index, so every label lands on the row it belongs to.
# (Resetting the looked-up labels to a 0..n-1 index and assigning them back
# would align on index labels; after the row filters df's index is no longer
# 0..n-1, so labels would be attached to the wrong genes or become NaN.)
# GO IDs absent from onto_meta map to NaN.
df = df.assign(**{'GO ID': df['GO ID'].map(onto_meta['Label'])})
df = df.set_index('DB Object Symbol')
df.index.name = 'Gene Symbol'
df.columns = ['GO Term']
df.head()

# Filter Data

## Map Gene Symbols to Up-to-date Approved Gene Symbols

In [None]:
# Run the gene symbols through uf.map_symbols with symbol_lookup, passing
# remove_duplicates=True (exact semantics are defined in uf.map_symbols, which
# is not visible here), then show the resulting (rows, columns).
df = uf.map_symbols(df, symbol_lookup, remove_duplicates=True)
df.shape

# Analyze Data

## Create Binary Matrix

In [None]:
# Build the binary matrix from the gene/GO-term table; every later output
# (lists, set libraries, similarity matrices, edge list) is derived from it.
binary_matrix = uf.binary_matrix(df)
binary_matrix.head()

In [None]:
# Dimensions of the binary matrix.
binary_matrix.shape

In [None]:
# Save the binary matrix to the output directory, requesting npz compression
# and a uint8 dtype.
uf.save_data(
    binary_matrix,
    path,
    f'{output_name}_binary_matrix',
    compression='npz',
    dtype=np.uint8,
)

## Create Gene List

In [None]:
# Derive the gene list from the binary matrix, using geneid_lookup
# (presumably to attach gene IDs to the symbols -- verify in uf.gene_list).
gene_list = uf.gene_list(binary_matrix, geneid_lookup)
gene_list.head()

In [None]:
# Dimensions of the gene list.
gene_list.shape

In [None]:
# Save the gene list as a gzip-compressed TSV, with index=False passed through.
uf.save_data(
    gene_list,
    path,
    f'{output_name}_gene_list',
    ext='tsv',
    compression='gzip',
    index=False,
)

## Create Attribute List

In [None]:
# Derive the attribute list from the binary matrix and preview it.
attribute_list = uf.attribute_list(binary_matrix)
attribute_list.head()

In [None]:
# Dimensions of the attribute list.
attribute_list.shape

In [None]:
# Save the attribute list as a gzip-compressed TSV.
uf.save_data(
    attribute_list,
    path,
    f'{output_name}_attribute_list',
    ext='tsv',
    compression='gzip',
)

## Create Gene and Attribute Set Libraries

In [None]:
# Write the 'gene' / 'up' set library built from the binary matrix.
uf.save_setlib(
    binary_matrix, 'gene', 'up', path, f'{output_name}_gene_up_set')

In [None]:
# Write the 'attribute' / 'up' set library built from the binary matrix.
uf.save_setlib(
    binary_matrix, 'attribute', 'up', path, f'{output_name}_attribute_up_set')

## Create Attribute Similarity Matrix

In [None]:
# Jaccard similarity computed on the transposed binary matrix (the gene
# similarity cell passes the matrix untransposed); sparse=True is forwarded
# to uf.similarity_matrix.
attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
# Save the attribute similarity matrix, requesting npz compression, the
# symmetric flag and a float32 dtype.
uf.save_data(
    attribute_similarity_matrix,
    path,
    f'{output_name}_attribute_similarity_matrix',
    compression='npz',
    symmetric=True,
    dtype=np.float32,
)

## Create Gene Similarity Matrix

In [None]:
# Jaccard similarity computed on the binary matrix as is (the attribute
# similarity cell passes its transpose); sparse=True is forwarded to
# uf.similarity_matrix.
gene_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
gene_similarity_matrix.head()

In [None]:
# Save the gene similarity matrix, requesting npz compression, the symmetric
# flag and a float32 dtype.
uf.save_data(
    gene_similarity_matrix,
    path,
    f'{output_name}_gene_similarity_matrix',
    compression='npz',
    symmetric=True,
    dtype=np.float32,
)

## Create Gene-Attribute Edge List

In [None]:
# Derive the gene-attribute edge list from the binary matrix and save it as a
# gzip-compressed TSV.
edge_list = uf.edge_list(binary_matrix)
uf.save_data(
    edge_list,
    path,
    f'{output_name}_edge_list',
    ext='tsv',
    compression='gzip',
)

# Create Downloadable Save File

In [None]:
# Archive the output directory for download. The link in the next markdown
# cell points at ./output_archive.zip (presumably the file this call
# produces -- verify in uf.archive).
uf.archive(path)

### Link to download output files: [click here](./output_archive.zip)