# Gene Ontology (GO) Molecular Function

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Downloaded: 04-2017 <br/>
Data Source: http://geneontology.org/gene-associations/goa_human.gaf.gz

In [1]:
import sys, datetime
import numpy as np
import pandas as pd
import importlib
import my_functions as mf
import goenrich
%matplotlib inline

In [2]:
# Reload my_functions so that edits to the module are picked up without restarting the kernel.
importlib.reload(mf)

<module 'my_functions' from '/Users/moshesilverstein/Documents/Harmonizome/GO/my_functions.py'>

# Load Data

In [3]:
# Read the annotation file as headerless, tab-separated text.
# NOTE(review): skiprows=34 presumably skips the leading comment/metadata lines
# of this particular download — verify the count whenever the file is refreshed.
df = pd.read_csv('input/goa_human.gaf', sep='\t', skiprows=34, header=None, index_col=False)

In [4]:
# Preview the first rows of the raw table (columns are still positional integers).
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0004871,GO_REF:0000038,IEA,UniProtKB-KW:KW-0807,F,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,UniProt,,
1,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0005834,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898,C,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,InterPro,,
2,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0007186,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898,P,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,InterPro,,
3,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0002377,GO_REF:0000033,IBA,PANTHER:PTN000587099,P,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,,
4,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0005615,GO_REF:0000033,IBA,PANTHER:PTN000587099,C,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,,


In [5]:
# (rows, columns) of the raw annotation table.
df.shape

(417076, 17)

# Get Relevant Columns and Data

In [6]:
# Descriptive names for the 17 tab-separated fields of the annotation file,
# listed in the order in which they appear in each row.
columns = [
    "DB",
    "DB Object ID",
    "DB Object Symbol",
    "Qualifier",
    "GO ID",
    "DB:Reference",
    "Evidence Code",
    "With (or) From",
    "Aspect",
    "DB Object Name",
    "DB Object Synonym",
    "DB Object Type",
    "Taxon",
    "Date",
    "Assigned By",
    "Annotation Extension",
    "Gene Product Form ID",
]

In [7]:
# Replace the positional column labels with the descriptive names in `columns`.
df.columns = columns

In [8]:
# Preview again to confirm the column names were applied.
df.head()

Unnamed: 0,DB,DB Object ID,DB Object Symbol,Qualifier,GO ID,DB:Reference,Evidence Code,With (or) From,Aspect,DB Object Name,DB Object Synonym,DB Object Type,Taxon,Date,Assigned By,Annotation Extension,Gene Product Form ID
0,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0004871,GO_REF:0000038,IEA,UniProtKB-KW:KW-0807,F,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,UniProt,,
1,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0005834,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898,C,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,InterPro,,
2,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0007186,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898,P,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,InterPro,,
3,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0002377,GO_REF:0000033,IBA,PANTHER:PTN000587099,P,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,,
4,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0005615,GO_REF:0000033,IBA,PANTHER:PTN000587099,C,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,,


#### Get Only Molecular Function Data

In [9]:
# Keep only the rows whose Aspect is 'F', working on an independent copy.
is_aspect_f = df['Aspect'].eq('F')
df = df.loc[is_aspect_f].copy()

#### Drop All Data That Is Inferred from Electronic Annotation

In [10]:
# Discard every annotation whose evidence code is IEA.
df = df.loc[df['Evidence Code'] != 'IEA']

#### Drop All Annotations With a 'NOT' Qualifier

In [11]:
# Remove negated annotations.
# The Qualifier field can hold several '|'-separated values (for example
# 'NOT|contributes_to'), so test whether NOT is one of the entries instead of
# requiring the whole field to equal 'NOT'. Empty qualifiers (NaN) are kept.
qualifiers = df['Qualifier'].fillna('').astype(str)
is_negated = qualifiers.str.contains(r'(?:^|\|)NOT(?:\||$)')
df = df[~is_negated]

In [12]:
# Narrow the table to the gene symbol and its GO term identifier.
df = df.loc[:, ['DB Object Symbol', 'GO ID']]

In [13]:
# Renumber the rows from 0; the previous labels are kept in a new 'index' column.
df = df.reset_index()

In [14]:
# Discard the 'index' column that holds the pre-reset row labels.
df = df.drop('index', axis=1)

In [15]:
# Preview the filtered two-column (gene symbol, GO ID) table.
df.head()

Unnamed: 0,DB Object Symbol,GO ID
0,IGKV2-28,GO:0004252
1,IGKV2-28,GO:0004252
2,IGHV3-64,GO:0003823
3,IGHV3-64,GO:0034987
4,IGHV4-4,GO:0003823


In [16]:
# Row count after the aspect, evidence-code and qualifier filters.
df.shape

(118343, 2)

# Load Gene Ontology Tree Digraph

In [17]:
# Parse the ontology file into a graph whose nodes carry the 'depth' and 'name'
# attributes read by the cells below.
# NOTE(review): the file name ends in '.ob'; GO releases are normally named
# 'go-basic.obo' — confirm the local file really uses this name.
digraph = goenrich.obo.ontology('input/go-basic.ob')

# Keep Only Terms With a Tree Depth of 4 or Greater

In [18]:
# Replace with NaN every GO ID that is either absent from the ontology graph or
# has a depth below 4. The row count is unchanged here; the NaN rows are
# removed by the dropna cell that follows.

# `Graph.node` was removed in networkx 2.4; fall back to `Graph.nodes` when absent.
go_nodes = digraph.node if hasattr(digraph, 'node') else digraph.nodes

total = len(df.index)  # loop-invariant, so computed once
lst = []

# Iterate over the column values directly: `DataFrame.ix` was removed in
# pandas 1.0, and this visits the rows in the same order.
for i, term in enumerate(df['GO ID']):

    progressPercent = ((i+1)/total)*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), total))
    sys.stdout.flush()

    if term in go_nodes and go_nodes[term]['depth'] >= 4:
        lst.append(term)
    else:
        lst.append(np.nan)

df['GO ID'] = lst

Progress: 100%  118343 Out of 118343   

In [19]:
# Shape is unchanged at this point: filtered-out terms were set to NaN, not removed.
df.shape

(118343, 2)

In [20]:
# Remove the rows whose GO ID was blanked out (NaN) by the depth filter.
df = df.dropna()

In [21]:
# Row count after dropping the NaN terms.
df.shape

(34198, 2)

In [22]:
# Preview the remaining rows; the row labels now have gaps where rows were dropped.
df.head()

Unnamed: 0,DB Object Symbol,GO ID
0,IGKV2-28,GO:0004252
1,IGKV2-28,GO:0004252
3,IGHV3-64,GO:0034987
5,IGHV4-4,GO:0034987
6,IGKV2D-30,GO:0004252


# Term Propagation: Propagate Child Gene-Term Relationships to Parent Terms

In [23]:
# For every annotation, also pair the gene with each graph predecessor of its
# GO term that has a depth of at least 4, then append those extra gene-term
# pairs to the table.
# NOTE(review): only direct predecessors are visited (one edge away), not all
# ancestors, and this relies on the graph's edges pointing from parent to
# child so that predecessors are parents — confirm against the goenrich
# version in use.

# `Graph.node` was removed in networkx 2.4; fall back to `Graph.nodes` when absent.
go_nodes = digraph.node if hasattr(digraph, 'node') else digraph.nodes

total = len(df.index)  # loop-invariant, so computed once
lst1 = []  # gene symbols of the propagated pairs
lst2 = []  # predecessor GO IDs of the propagated pairs

# Walk both columns in row order; `DataFrame.ix` was removed in pandas 1.0.
for i, (symbol, term) in enumerate(zip(df['DB Object Symbol'], df['GO ID'])):

    progressPercent = ((i+1)/total)*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), total))
    sys.stdout.flush()

    for parent in digraph.predecessors(term):
        if parent in go_nodes and go_nodes[parent]['depth'] >= 4:
            lst1.append(symbol)
            lst2.append(parent)


temp = pd.DataFrame({'DB Object Symbol': lst1, 'GO ID': lst2},
                    columns=['DB Object Symbol', 'GO ID'])

# ignore_index renumbers the combined rows from 0, replacing the separate
# reset_index / drop('index') steps.
df = pd.concat([df, temp], ignore_index=True)

Progress: 100%  34198 Out of 34198   

In [24]:
# Row count after appending the propagated gene-term pairs.
df.shape

(229223, 2)

# Map GO ID to Descriptive Name 

In [25]:
# Replace each GO identifier with the 'name' attribute of its ontology node.
# The column keeps the label 'GO ID' even though it holds names afterwards.

# `Graph.node` was removed in networkx 2.4; fall back to `Graph.nodes` when absent.
go_nodes = digraph.node if hasattr(digraph, 'node') else digraph.nodes

total = len(df.index)  # loop-invariant, so computed once
lst = []

# Iterate over the column values directly: `DataFrame.ix` was removed in pandas 1.0.
for i, term in enumerate(df['GO ID']):

    progressPercent = ((i+1)/total)*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), total))
    sys.stdout.flush()

    lst.append(go_nodes[term]['name'])

df['GO ID'] = lst

Progress: 99%  228388 Out of 229223   

In [31]:
# The 'GO ID' column now holds term names rather than GO identifiers.
df.head()

Unnamed: 0,DB Object Symbol,GO ID
0,IGKV2-28,serine-type endopeptidase activity
1,IGKV2-28,serine-type endopeptidase activity
2,IGHV3-64,immunoglobulin receptor binding
3,IGHV4-4,immunoglobulin receptor binding
4,IGKV2D-30,serine-type endopeptidase activity


# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [32]:
# Use the gene symbol as the row index, as expected by the symbol-mapping step.
df = df.set_index('DB Object Symbol')

In [33]:
# Helper from my_functions (implementation not shown here). The return value is
# not used and the row count differs after this call, so it appears to modify
# `df` in place — presumably dropping symbols it cannot map; confirm in my_functions.
mf.mapgenesymbols(df)

Progeres: 100%  229223 Out of 229223   

In [34]:
# Row count after gene-symbol mapping (one data column; symbols are the index).
df.shape

(228712, 1)

# Drop Duplicates

In [35]:
# Move the gene symbols out of the index and back into a regular column.
df = df.reset_index()

In [36]:
# Keep a single row for each distinct combination of column values.
df = df.drop_duplicates()

In [37]:
# Row count after removing duplicate rows.
df.shape

(157088, 2)

# Create Binary Matrix

In [38]:
# Build the matrix from the gene/term table using the my_functions helper
# (implementation not shown here); the preview below has gene symbols as rows,
# term names as columns, and 0/1 entries.
binary_matrix = mf.createBinaryMatix(df)

Progeres: 100%  10716 Out of 10716   

In [39]:
# Preview the first rows of the binary matrix.
binary_matrix.head()

Unnamed: 0,polygalacturonase activity,NAD(P)+-protein-arginine ADP-ribosyltransferase activity,cholate 7-alpha-dehydrogenase activity,carbon monoxide sensor activity,phospholipase A2 activity,CCR9 chemokine receptor binding,D-glucose transmembrane transporter activity,11-hydroxythromboxane B2 dehydrogenase activity,methionine transmembrane transporter activity,isocitrate O-dihydroxycinnamoyltransferase activity,...,ubiquitin-specific protease activity involved in positive regulation of ERAD pathway,oligoxyloglucan reducing-end-specific cellobiohydrolase activity,G-protein coupled glutamate receptor binding,type 5A serotonin receptor binding,sn-1-glycerol-3-phosphate C16:0-DCA-CoA acyl transferase activity,"1,2-diacylglycerol 3-glucosyltransferase activity",type 4 somatostatin receptor binding,insulin-like growth factor-activated receptor activity,protein methyltransferase activity,sequence-specific DNA binding
PNPLA3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
KRT18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ZGPAT,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
DNAJB4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LZTS1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [40]:
# Dimensions of the binary matrix (rows, columns).
binary_matrix.shape

(10716, 7115)

# Save Binary Matrix

In [41]:
# Tag the output file with the current year and month, e.g. "2017_07".
date_tag = datetime.date.today().strftime('%Y_%m')

# The file is written gzip-compressed, so it carries a ".gz" suffix; the earlier
# ".zip" suffix mislabelled the content and breaks extension-based decompression.
filename = '~/./Documents/Harmonizome/GO/output/go_molecular_function_binary_matrix_%s.tsv.gz' % date_tag
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [42]:
# Output directory handed to the gene-set library helper.
# NOTE(review): absolute, user-specific path — it will not exist on another machine.
path = '/Users/moshesilverstein/Documents/Harmonizome/GO/output/'

In [43]:
# Base name handed to the gene-set library helper.
name = 'go_molecular_function_gene_set'

In [44]:
# Build the gene set library with the my_functions helper (implementation not
# shown here); presumably it writes its result under `path` using `name` — confirm.
mf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  7115 Out of 7115   

# Create Attribute Library

In [45]:
# Output directory handed to the attribute-set library helper.
# NOTE(review): absolute, user-specific path — it will not exist on another machine.
path = '/Users/moshesilverstein/Documents/Harmonizome/GO/output/'

In [46]:
# Base name handed to the attribute-set library helper.
name = 'go_molecular_function_attribute_set'

In [47]:
# Build the attribute set library with the my_functions helper (implementation
# not shown here); presumably it writes its result under `path` using `name` — confirm.
mf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  10716 Out of 10716   

# Create Gene Similarity Matrix

In [48]:
# Compute a similarity matrix from the binary matrix with the 'jaccard' option
# of the my_functions helper (implementation not shown here); the preview below
# is indexed by gene symbol on both axes.
gene_similarity_matix = mf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [49]:
# Preview the first rows of the gene similarity matrix.
gene_similarity_matix.head()

Unnamed: 0,PNPLA3,KRT18,ZGPAT,DNAJB4,LZTS1,NET1,CARF,FCER1G,OR1C1,GPR157,...,KNG1,RPIA,BAHCC1,SLC6A5,PDIA6,KITLG,MAGEL2,CELA1,PPP2R2B,PGM3
PNPLA3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
KRT18,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ZGPAT,0.0,0.0,1.0,0.0,0.032258,0.0,0.032258,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DNAJB4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LZTS1,0.0,0.0,0.032258,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Gene Similarity Matrix 

In [50]:
# Tag the output file with the current year and month, e.g. "2017_07".
date_tag = datetime.date.today().strftime('%Y_%m')

# The file is written gzip-compressed, so it carries a ".gz" suffix; the earlier
# ".zip" suffix mislabelled the content and breaks extension-based decompression.
filename = '~/./Documents/Harmonizome/GO/output/go_molecular_function_gene_similarity_matix_%s.tsv.gz' % date_tag
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [51]:
# Same helper and 'jaccard' option as for genes, but applied to the transposed
# binary matrix; the preview below is indexed by term name on both axes.
attribute_similarity_matix = mf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [52]:
# Preview the first rows of the attribute similarity matrix.
attribute_similarity_matix.head()

Unnamed: 0,polygalacturonase activity,NAD(P)+-protein-arginine ADP-ribosyltransferase activity,cholate 7-alpha-dehydrogenase activity,carbon monoxide sensor activity,phospholipase A2 activity,CCR9 chemokine receptor binding,D-glucose transmembrane transporter activity,11-hydroxythromboxane B2 dehydrogenase activity,methionine transmembrane transporter activity,isocitrate O-dihydroxycinnamoyltransferase activity,...,ubiquitin-specific protease activity involved in positive regulation of ERAD pathway,oligoxyloglucan reducing-end-specific cellobiohydrolase activity,G-protein coupled glutamate receptor binding,type 5A serotonin receptor binding,sn-1-glycerol-3-phosphate C16:0-DCA-CoA acyl transferase activity,"1,2-diacylglycerol 3-glucosyltransferase activity",type 4 somatostatin receptor binding,insulin-like growth factor-activated receptor activity,protein methyltransferase activity,sequence-specific DNA binding
polygalacturonase activity,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NAD(P)+-protein-arginine ADP-ribosyltransferase activity,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cholate 7-alpha-dehydrogenase activity,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
carbon monoxide sensor activity,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
phospholipase A2 activity,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [53]:
# Tag the output file with the current year and month, e.g. "2017_07".
date_tag = datetime.date.today().strftime('%Y_%m')

# The file is written gzip-compressed, so it carries a ".gz" suffix; the earlier
# ".zip" suffix mislabelled the content and breaks extension-based decompression.
filename = '~/./Documents/Harmonizome/GO/output/go_molecular_function_attribute_similarity_matix_%s.tsv.gz' % date_tag
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [54]:
# Build the gene list from the binary matrix with the my_functions helper
# (implementation not shown here); the preview below has GeneSym and GeneID columns.
gene_list = mf.createGeneList(binary_matrix)

Progeres: 100%  10716 Out of 10716   

In [55]:
# Preview the first rows of the gene list.
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,PNPLA3,80339
1,KRT18,3875
2,ZGPAT,84619
3,DNAJB4,11080
4,LZTS1,11178


In [56]:
# Dimensions of the gene list (rows, columns).
gene_list.shape

(10716, 2)

# Save Gene List

In [57]:
# Tag the output file with the current year and month, e.g. "2017_07".
date_tag = datetime.date.today().strftime('%Y_%m')

# The file is written gzip-compressed, so it carries a ".gz" suffix; the earlier
# ".zip" suffix mislabelled the content and breaks extension-based decompression.
filename = '~/./Documents/Harmonizome/GO/output/go_molecular_function_gene_list_%s.tsv.gz' % date_tag
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [58]:
# Build the attribute list from the binary matrix with the my_functions helper
# (implementation not shown here); the preview below has a single Attributes column.
attribute_list = mf.createAttributeList(binary_matrix)

In [59]:
# Preview the first rows of the attribute list.
attribute_list.head()

Unnamed: 0,Attributes
0,polygalacturonase activity
1,NAD(P)+-protein-arginine ADP-ribosyltransferas...
2,cholate 7-alpha-dehydrogenase activity
3,carbon monoxide sensor activity
4,phospholipase A2 activity


In [60]:
# Dimensions of the attribute list (rows, columns).
attribute_list.shape

(7115, 1)

# Save Attribute List

In [61]:
# Tag the output file with the current year and month, e.g. "2017_07".
date_tag = datetime.date.today().strftime('%Y_%m')

# The file is written gzip-compressed, so it carries a ".gz" suffix; the earlier
# ".zip" suffix mislabelled the content and breaks extension-based decompression.
filename = '~/./Documents/Harmonizome/GO/output/go_molecular_function_attribute_list_%s.tsv.gz' % date_tag
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [62]:
# Output directory handed to the gene-attribute edge list helper.
# NOTE(review): absolute, user-specific path — it will not exist on another machine.
path = '/Users/moshesilverstein/Documents/Harmonizome/GO/output/'

In [63]:
# Base name handed to the gene-attribute edge list helper.
name = 'go_molecular_function_gene_attribute_edge_list'

In [64]:
# Build the gene-attribute edge list from the binary matrix and the gene list
# with the my_functions helper (implementation not shown here); presumably it
# writes its result under `path` using `name` — confirm.
mf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  7115 Out of 7115   

 The number of statisticaly relevent gene-attribute associations is: 157088
