# Gene Ontology (GO) Biological Process

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Downloaded: 04-2017 <br/>
Data Source: http://geneontology.org/gene-associations/goa_human.gaf.gz

In [1]:
import sys, datetime
import numpy as np
import pandas as pd
import importlib
import my_functions as mf
import goenrich
%matplotlib inline

In [2]:
# Re-import the local helper module so that edits made to my_functions.py
# are picked up without restarting the kernel.
importlib.reload(mf)

<module 'my_functions' from '/Users/moshesilverstein/Documents/Harmonizome/GO/my_functions.py'>

# Load Data

In [3]:
# Load the GO annotation file (GAF). The file opens with a block of
# '!'-prefixed header/comment lines; count them rather than hard-coding the
# number of rows to skip, so a release with a longer or shorter header still
# loads correctly.
gaf_path = 'input/goa_human.gaf'

n_header_lines = 0
with open(gaf_path, encoding='utf-8', errors='replace') as gaf_file:
    for line in gaf_file:
        if not line.startswith('!'):
            break
        n_header_lines += 1

# The data rows carry no column-name row (header=None); index_col=False keeps
# pandas from promoting the first column to the index.
df = pd.read_csv(gaf_path, sep='\t', skiprows=n_header_lines, header=None, index_col=False)

In [4]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0004871,GO_REF:0000038,IEA,UniProtKB-KW:KW-0807,F,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,UniProt,,
1,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0005834,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898,C,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,InterPro,,
2,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0007186,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898,P,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,InterPro,,
3,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0002377,GO_REF:0000033,IBA,PANTHER:PTN000587099,P,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,,
4,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0005615,GO_REF:0000033,IBA,PANTHER:PTN000587099,C,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,,


In [5]:
df.shape

(417076, 17)

# Get Relevant Columns and Data

In [6]:
# Names for the 17 positional columns of the GAF table, in file order.
columns = [
    "DB",
    "DB Object ID",
    "DB Object Symbol",
    "Qualifier",
    "GO ID",
    "DB:Reference",
    "Evidence Code",
    "With (or) From",
    "Aspect",
    "DB Object Name",
    "DB Object Synonym",
    "DB Object Type",
    "Taxon",
    "Date",
    "Assigned By",
    "Annotation Extension",
    "Gene Product Form ID",
]

In [7]:
# Replace the positional column labels (0..16) with the GAF field names.
df.columns = columns

In [8]:
df.head()

Unnamed: 0,DB,DB Object ID,DB Object Symbol,Qualifier,GO ID,DB:Reference,Evidence Code,With (or) From,Aspect,DB Object Name,DB Object Synonym,DB Object Type,Taxon,Date,Assigned By,Annotation Extension,Gene Product Form ID
0,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0004871,GO_REF:0000038,IEA,UniProtKB-KW:KW-0807,F,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,UniProt,,
1,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0005834,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898,C,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,InterPro,,
2,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0007186,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898,P,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,InterPro,,
3,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0002377,GO_REF:0000033,IBA,PANTHER:PTN000587099,P,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,,
4,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0005615,GO_REF:0000033,IBA,PANTHER:PTN000587099,C,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,,


#### Get Only Biological Process Data

In [9]:
# Keep only rows whose Aspect code is 'P' (Biological Process); copy so the
# result is detached from the full annotation table.
is_biological_process = df['Aspect'].eq('P')
df = df.loc[is_biological_process].copy()

#### Drop All Data That Is Inferred from Electronic Annotation

In [10]:
# Discard annotations carrying the IEA evidence code
# (inferred from electronic annotation).
df = df[df['Evidence Code'] != 'IEA']

#### Drop All Annotations With a 'NOT' Qualifier

In [11]:
# Remove negated annotations. The Qualifier field can hold several
# pipe-separated flags (e.g. 'NOT|contributes_to'), so look for a 'NOT' token
# instead of requiring the whole field to equal 'NOT'. Missing qualifiers are
# treated as "no flags".
qualifier_tokens = df['Qualifier'].fillna('').astype(str).str.split('|')
is_negated = qualifier_tokens.apply(lambda tokens: 'NOT' in tokens).astype(bool)
df = df[~is_negated]

In [12]:
# Keep only the gene symbol and GO term columns. Take an explicit copy: later
# cells modify this frame in place and assign to its 'GO ID' column, which on
# a slice of the filtered table can trigger SettingWithCopyWarning.
df = df[['DB Object Symbol', 'GO ID']].copy()

In [13]:
# Renumber the rows from 0; the row labels left over from filtering are moved
# into a new 'index' column.
df = df.reset_index()

In [14]:
# Discard the 'index' column that holds the pre-reset row labels.
df = df.drop('index', axis=1)

In [15]:
df.head()

Unnamed: 0,DB Object Symbol,GO ID
0,IGKV3-7,GO:0002377
1,IGKV3-7,GO:0006955
2,IGKV1D-42,GO:0002377
3,IGKV1D-42,GO:0006955
4,IGLV4-69,GO:0002377


In [16]:
df.shape

(94691, 2)

# Load Gene Ontology Tree Digraph

In [17]:
# Parse the GO ontology file (OBO format) into a graph of GO terms.
# The ontology is distributed as 'go-basic.obo'; the previous '.ob' suffix
# did not match that file name.
digraph = goenrich.obo.ontology('input/go-basic.obo')

# Keep Only Terms With a Tree Depth of 4 or Greater

In [18]:
# Blank out (NaN) every GO ID that is either absent from the ontology graph or
# has a 'depth' attribute below 4, so those rows can be removed with dropna().
# Terms are read straight from the column: the `df.ix` indexer used before is
# deprecated and no longer exists in pandas >= 1.0.
# NOTE(review): `digraph.node` is the older networkx spelling; newer releases
# expose the same mapping as `digraph.nodes` — confirm the installed version.
total_rows = len(df.index)
lst = []

for i, term in enumerate(df['GO ID']):

    progressPercent = ((i+1)/total_rows)*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), total_rows))
    sys.stdout.flush()

    if term in digraph.node and digraph.node[term]['depth'] >= 4:
        lst.append(term)
    else:
        lst.append(np.nan)

df['GO ID'] = lst

Progress: 100%  94691 Out of 94691   

In [19]:
df.shape

(94691, 2)

In [20]:
# Remove every row that contains a missing value, i.e. the rows whose GO ID
# was replaced by NaN during depth filtering.
df = df.dropna()

In [21]:
df.shape

(79686, 2)

In [22]:
df.head()

Unnamed: 0,DB Object Symbol,GO ID
39,IGKV2-28,GO:0006898
42,IGKV2-28,GO:0006958
43,IGKV2-28,GO:0030449
44,IGKV2-28,GO:0038095
45,IGKV2-28,GO:0038096


# Term Propagation: propagate child gene-term relationships to parent terms

In [23]:
# For every (gene, term) row, also pair the gene with each graph predecessor of
# the term whose 'depth' attribute is at least 4, then append the new pairs to df.
# Rows are read by iterating the two columns together: the `df.ix` indexer
# used before is deprecated and no longer exists in pandas >= 1.0.
# NOTE(review): `predecessors` yields direct neighbours only, and treating them
# as parents assumes edges point parent -> child — confirm against goenrich.
total_rows = len(df.index)
lst1 = []  # gene symbol of each propagated pair
lst2 = []  # parent GO ID of each propagated pair

for i, (symbol, term) in enumerate(zip(df['DB Object Symbol'], df['GO ID'])):

    progressPercent = ((i+1)/total_rows)*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), total_rows))
    sys.stdout.flush()

    for parent in digraph.predecessors(term):
        if parent in digraph.node and digraph.node[parent]['depth'] >= 4:
            lst1.append(symbol)
            lst2.append(parent)


temp = pd.DataFrame({'DB Object Symbol': lst1, 'GO ID': lst2},
                    columns=['DB Object Symbol', 'GO ID'])
# ignore_index=True renumbers the combined rows 0..n-1 in one step.
df = pd.concat([df, temp], ignore_index=True)

Progress: 100%  79686 Out of 79686   

In [24]:
df.shape

(618023, 2)

# Map GO ID to Descriptive Name 

In [25]:
# Swap each GO ID for the 'name' attribute stored on its ontology graph node.
# The column keeps the label 'GO ID' even though it holds term names afterwards.
# Terms are read straight from the column: the `df.ix` indexer used before is
# deprecated and no longer exists in pandas >= 1.0.
total_rows = len(df.index)
lst = []

for i, term in enumerate(df['GO ID']):

    progressPercent = ((i+1)/total_rows)*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), total_rows))
    sys.stdout.flush()

    lst.append(digraph.node[term]['name'])

df['GO ID'] = lst

Progress: 100%  618023 Out of 618023   

In [26]:
df.head()

Unnamed: 0,DB Object Symbol,GO ID
0,IGKV2-28,receptor-mediated endocytosis
1,IGKV2-28,"complement activation, classical pathway"
2,IGKV2-28,regulation of complement activation
3,IGKV2-28,Fc-epsilon receptor signaling pathway
4,IGKV2-28,Fc-gamma receptor signaling pathway involved i...


# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [27]:
# Use the gene symbol as the row index ahead of the symbol-mapping step.
df = df.set_index('DB Object Symbol')

In [28]:
# The return value is not captured, so the helper presumably updates df (whose
# index holds the gene symbols) in place — verify in my_functions.mapgenesymbols.
mf.mapgenesymbols(df)

Progeres: 99%  618020 Out of 618023   

In [29]:
df.shape

# Drop Duplicates

In [30]:
# Turn the index back into an ordinary column so that duplicate detection
# considers it together with the term column.
df = df.reset_index()

In [31]:
# Keep a single copy of each fully identical row.
df = df.drop_duplicates()

In [32]:
df.shape

(510389, 2)

# Create Binary Matrix

In [41]:
# Build the gene-by-attribute matrix from the deduplicated pairs in df.
# Judging by the recorded preview, rows are gene symbols, columns are GO term
# names and cells are 0/1 — the helper itself is not visible here.
binary_matrix = mf.createBinaryMatix(df)

Progeres: 100%  13334 Out of 13334   

In [42]:
binary_matrix.head()

Unnamed: 0,tRNA modification,Fc-gamma receptor signaling pathway,minus-end-directed organelle transport along microtubule,termination of mitochondrial transcription,regulation of ribonuclease activity,T-helper 1 cell activation,saturated monocarboxylic acid metabolic process,purine nucleotide interconversion,regulation of endoplasmic reticulum stress-induced neuron intrinsic apoptotic signaling pathway,cellular response to potassium ion,...,fibroblast growth factor receptor signaling pathway involved in somitogenesis,sinoatrial node development,"protein deglycation, methylglyoxal removal",positive regulation of slow-twitch skeletal muscle fiber contraction,positive regulation of interleukin-22 biosynthetic process,positive regulation of oxidative stress-induced intrinsic apoptotic signaling pathway,regulation of phagocytosis,calcium ion transmembrane import into cytosol,smooth muscle cell chemotaxis,viral protein processing
C9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
UBLCP1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
RAD51C,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CD27,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CYGB,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [43]:
binary_matrix.shape

(13334, 17820)

# Save Binary Matrix

In [44]:
# Write the binary matrix as a gzip-compressed, tab-separated file whose name
# carries the current year and month (YYYY_MM).
# NOTE(review): the name ends in .zip although the payload is gzip — confirm intended.
month_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GO/output/go_biological_process_binary_matrix_%s.tsv.zip' % month_stamp
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [45]:
# Output directory handed to the gene set library helper.
# NOTE(review): absolute, user-specific path — will not resolve on another machine.
path = '/Users/moshesilverstein/Documents/Harmonizome/GO/output/'

In [46]:
# Base name handed to the gene set library helper.
name = 'go_biological_process_gene_set'

In [47]:
# Presumably writes a gene set library derived from binary_matrix into `path`
# using `name` for the file — nothing is returned here; verify in my_functions.
mf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  17820 Out of 17820   

# Create Attribute Library

In [48]:
# Output directory handed to the attribute set library helper.
# NOTE(review): absolute, user-specific path — will not resolve on another machine.
path = '/Users/moshesilverstein/Documents/Harmonizome/GO/output/'

In [49]:
# Base name handed to the attribute set library helper.
name = 'go_biological_process_attribute_set'

In [50]:
# Presumably writes an attribute set library derived from binary_matrix into
# `path` using `name` for the file — nothing is returned here; verify in my_functions.
mf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  13334 Out of 13334   

# Create Gene Similarity Matrix

In [51]:
# Gene-by-gene similarity computed from binary_matrix with the 'jaccard'
# metric requested from the helper; the recorded preview shows gene symbols on
# both axes.
gene_similarity_matix = mf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [52]:
gene_similarity_matix.head()

Unnamed: 0,C9,UBLCP1,RAD51C,CD27,CYGB,OR4X2,PCGF6,ITGB3BP,ZNF701,ASAH1,...,DAZ1,TOX3,FIG4,ZGPAT,PMAIP1,PDCD10,FUNDC1,SCAMP2,PRKAA1,ATAT1
C9,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
UBLCP1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
RAD51C,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005,0.0
CD27,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.009009,0.0,0.0,0.005,0.105769,0.0,0.0,0.090047,0.0
CYGB,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Gene Similarity Matrix 

In [53]:
# Write the gene similarity matrix as a gzip-compressed, tab-separated file
# whose name carries the current year and month (YYYY_MM).
# NOTE(review): the name ends in .zip although the payload is gzip — confirm intended.
month_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GO/output/go_biological_process_gene_similarity_matix_%s.tsv.zip' % month_stamp
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [54]:
# Same helper as for genes, but fed the transposed matrix so that attributes
# (GO term names) are compared with each other using the 'jaccard' metric.
attribute_similarity_matix = mf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [55]:
attribute_similarity_matix.head()

Unnamed: 0,tRNA modification,Fc-gamma receptor signaling pathway,minus-end-directed organelle transport along microtubule,termination of mitochondrial transcription,regulation of ribonuclease activity,T-helper 1 cell activation,saturated monocarboxylic acid metabolic process,purine nucleotide interconversion,regulation of endoplasmic reticulum stress-induced neuron intrinsic apoptotic signaling pathway,cellular response to potassium ion,...,fibroblast growth factor receptor signaling pathway involved in somitogenesis,sinoatrial node development,"protein deglycation, methylglyoxal removal",positive regulation of slow-twitch skeletal muscle fiber contraction,positive regulation of interleukin-22 biosynthetic process,positive regulation of oxidative stress-induced intrinsic apoptotic signaling pathway,regulation of phagocytosis,calcium ion transmembrane import into cytosol,smooth muscle cell chemotaxis,viral protein processing
tRNA modification,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Fc-gamma receptor signaling pathway,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.019608,0.0
minus-end-directed organelle transport along microtubule,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.020833
termination of mitochondrial transcription,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.022222,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
regulation of ribonuclease activity,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [56]:
# Write the attribute similarity matrix as a gzip-compressed, tab-separated
# file whose name carries the current year and month (YYYY_MM).
# NOTE(review): the name ends in .zip although the payload is gzip — confirm intended.
month_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GO/output/go_biological_process_attribute_similarity_matix_%s.tsv.zip' % month_stamp
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [57]:
# Table of the genes in binary_matrix; the recorded preview shows a 'GeneSym'
# and a 'GeneID' column. How IDs are looked up is inside the helper — not visible here.
gene_list = mf.createGeneList(binary_matrix)

Progeres: 100%  13334 Out of 13334   

In [58]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,C9,735
1,UBLCP1,134510
2,RAD51C,5889
3,CD27,939
4,CYGB,114757


In [59]:
gene_list.shape

(13334, 2)

# Save Gene List

In [60]:
# Write the gene list (without the row index) as a gzip-compressed,
# tab-separated file whose name carries the current year and month (YYYY_MM).
# NOTE(review): the name ends in .zip although the payload is gzip — confirm intended.
month_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GO/output/go_biological_process_gene_list_%s.tsv.zip' % month_stamp
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [61]:
# Table of the attributes in binary_matrix; the recorded preview shows a
# single 'Attributes' column holding the GO term names.
attribute_list = mf.createAttributeList(binary_matrix)

In [62]:
attribute_list.head()

Unnamed: 0,Attributes
0,tRNA modification
1,Fc-gamma receptor signaling pathway
2,minus-end-directed organelle transport along m...
3,termination of mitochondrial transcription
4,regulation of ribonuclease activity


In [63]:
attribute_list.shape

(17820, 1)

# Save Attribute List

In [64]:
# Write the attribute list (without the row index) as a gzip-compressed,
# tab-separated file whose name carries the current year and month (YYYY_MM).
# NOTE(review): the name ends in .zip although the payload is gzip — confirm intended.
month_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GO/output/go_biological_process_attribute_list_%s.tsv.zip' % month_stamp
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [65]:
# Output directory handed to the gene-attribute edge list helper.
# NOTE(review): absolute, user-specific path — will not resolve on another machine.
path = '/Users/moshesilverstein/Documents/Harmonizome/GO/output/'

In [66]:
# Base name handed to the gene-attribute edge list helper.
name = 'go_biological_process_gene_attribute_edge_list'

In [67]:
# Presumably writes one gene-attribute edge per association in binary_matrix
# into `path` under `name`, using gene_list for gene identifiers — verify in
# my_functions. The recorded output reports the number of associations.
mf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  17820 Out of 17820   

 The number of statisticaly relevent gene-attribute associations is: 510389
