# Gene Ontology (GO) Molecular Function

Author: Moshe Silverstein <br/>
Date: 03-2018 <br/>
Data Downloaded: 03-2018 <br/>
Data Source: http://geneontology.org/gene-associations/goa_human.gaf.gz

In [1]:
import sys, datetime
import numpy as np
import pandas as pd
import importlib
import untility_functions as uf
import goenrich
%matplotlib inline

In [2]:
importlib.reload(uf)

<module 'untility_functions' from '/Users/moshesilverstein/Documents/Harmonizome/GO/untility_functions.py'>

# Versions Of Modules In Use

In [3]:
%load_ext version_information
%version_information numpy, pandas, clustergrammer_widget 

Software,Version
Python,3.5.2 64bit [GCC 4.2.1 Compatible Apple LLVM 4.2 (clang-425.0.28)]
IPython,5.3.0
OS,Darwin 17.2.0 x86_64 i386 64bit
numpy,1.13.1
pandas,0.21.0
clustergrammer_widget,1.9.0
Thu Mar 22 16:55:18 2018 EDT,Thu Mar 22 16:55:18 2018 EDT


# Load Data

In [4]:
df1 = pd.read_csv('input/goa_human.gaf', sep='~', skiprows=23, header=None, index_col=False)

In [5]:
df1.head()

Unnamed: 0,0
0,UniProtKB\tA0A024R161\tDNAJC25-GNG10\t\tGO:000...
1,UniProtKB\tA0A024R161\tDNAJC25-GNG10\t\tGO:000...
2,UniProtKB\tA0A024R161\tDNAJC25-GNG10\t\tGO:000...
3,UniProtKB\tA0A075B6H7\tIGKV3-7\t\tGO:0002377\t...
4,UniProtKB\tA0A075B6H7\tIGKV3-7\t\tGO:0005615\t...


In [6]:
# df = pd.DataFrame()

matrix = np.chararray((df1.shape[0], 16), itemsize=150, unicode=True)

# for i, index in enumerate(df1.index):
for i, row in enumerate(df1.itertuples()):
    
    progressPercent = ((i+1)/len(df1.index))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(df1.index)))
    sys.stdout.flush()
    
#     lst = df1.loc[index, 0].split('\t')
    lst = row[1].split('\t')
    if len(lst) == 15:
        lst.append(np.nan)
    matrix[i, :] = lst
#     temp = pd.DataFrame(data = lst)
#     df = pd.concat([df, temp.T])

df = pd.DataFrame(data=matrix)

Progress: 99%  495871 Out of 496852   

In [7]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0004871,GO_REF:0000038,IEA,UniProtKB-KW:KW-0807,F,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20180224,UniProt,
1,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0005834,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898|InterPro...,C,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20180224,InterPro,
2,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0007186,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898|InterPro...,P,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20180224,InterPro,
3,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0002377,GO_REF:0000033,IBA,PANTHER:PTN000587099,P,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,
4,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0005615,GO_REF:0000033,IBA,PANTHER:PTN000587099,C,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,


In [8]:
df.shape

(496852, 16)

# Get Relevent Columns and Data

In [9]:
columns = ["DB", "DB Object ID", "DB Object Symbol", "Qualifier", "GO ID", "DB:Reference", "Evidence Code", 
               "With (or) From", "Aspect", "DB Object Name", "DB Object Synonym", "DB Object Type", "Taxon", 
               "Date", "Assigned By", "Annotation Extension"]

In [10]:
df.columns = columns

In [11]:
df.head()

Unnamed: 0,DB,DB Object ID,DB Object Symbol,Qualifier,GO ID,DB:Reference,Evidence Code,With (or) From,Aspect,DB Object Name,DB Object Synonym,DB Object Type,Taxon,Date,Assigned By,Annotation Extension
0,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0004871,GO_REF:0000038,IEA,UniProtKB-KW:KW-0807,F,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20180224,UniProt,
1,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0005834,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898|InterPro...,C,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20180224,InterPro,
2,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0007186,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898|InterPro...,P,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20180224,InterPro,
3,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0002377,GO_REF:0000033,IBA,PANTHER:PTN000587099,P,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,
4,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0005615,GO_REF:0000033,IBA,PANTHER:PTN000587099,C,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,


#### Get Only Molecular Function Data

In [12]:
df = df[df['Aspect']=='F'].copy()

#### Drop All Data That Is Inferred from Electronic Annotation

In [13]:
df = df[~df['Evidence Code'].isin(['IEA'])]

#### Drop All 'Non' in Qualifier 

In [14]:
df = df[~df['Qualifier'].isin(['NOT'])]

In [15]:
df = df[['DB Object Symbol', 'GO ID']]

In [16]:
df.reset_index(inplace=True)

In [17]:
df.drop('index', axis=1, inplace=True)

In [18]:
df.head()

Unnamed: 0,DB Object Symbol,GO ID
0,IGKV2-28,GO:0004252
1,IGKV2-28,GO:0004252
2,IGHV3-64,GO:0003823
3,IGHV3-64,GO:0034987
4,IGHV4-4,GO:0003823


In [19]:
df.shape

(152328, 2)

# Load Gene Ontology Tree Digraph

In [20]:
digraph = goenrich.obo.ontology('input/go-basic.ob')

# Keep Only Terms With a Tree Depth of 4 or Greaterm

In [21]:
lst = []

for i,index in enumerate(df.index):
    
    progressPercent = ((i+1)/len(df.index))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(df.index)))
    sys.stdout.flush()
    
    term = df.loc[index, 'GO ID']
    if term in digraph.node:
        if digraph.node[term]['depth'] >= 4:
            lst.append(term)
        else:
            lst.append(np.nan)
    else:
        lst.append(np.nan)

df['GO ID'] = lst

Progress: 100%  152328 Out of 152328   

In [22]:
df.shape

(152328, 2)

In [23]:
df.dropna(inplace=True)

In [24]:
df.shape

(46497, 2)

In [25]:
df.head()

Unnamed: 0,DB Object Symbol,GO ID
0,IGKV2-28,GO:0004252
1,IGKV2-28,GO:0004252
3,IGHV3-64,GO:0034987
5,IGHV4-4,GO:0034987
6,IGKV2D-30,GO:0004252


# Term Propagation-propergate child gene term relationships to parent terms

In [26]:
lst1 = []
lst2 = []

for i,index in enumerate(df.index):
    
#     progressPercent = ((i+1)/len(df.index))*100

#     sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(df.index)))
#     sys.stdout.flush()
    
#     term = df.loc[index, 'GO ID']
#     for parent in digraph.predecessors(term):
#         if parent in digraph.node:
#             if digraph.node[parent]['depth'] >= 4:
#                 lst1.append(df.loc[index, 'DB Object Symbol'])
#                 lst2.append(parent)
#                 print(term, parent)
    term = df.loc[index, 'GO ID']
    for parent in digraph.successors(term):
        if parent in digraph.node:
            if digraph.node[parent]['depth'] >= 4:
                lst1.append(df.loc[index, 'DB Object Symbol'])
                lst2.append(parent)
#                 print(term, parent)


temp = pd.DataFrame()
temp['DB Object Symbol'] = lst1
temp['GO ID']  = lst2
df = pd.concat([df, temp])
df.reset_index(inplace=True)
df.drop('index', axis=1, inplace=True)

In [27]:
df.shape

(88786, 2)

# Map GO ID to Descriptive Name 

In [28]:
lst = []

for i,index in enumerate(df.index):
    
    progressPercent = ((i+1)/len(df.index))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(df.index)))
    sys.stdout.flush()
    
    lst.append(str(digraph.node[df.ix[index, 'GO ID']]['name'])+' '+ '('+str(df.loc[index, 'GO ID'])+')')
    
df['GO ID'] = lst

Progress: 0%  221 Out of 88786   

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated


Progress: 99%  88424 Out of 88786   

In [29]:
df.head()

# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [30]:
df.set_index('DB Object Symbol', inplace=True)

In [31]:
uf.mapgenesymbols(df)

Progeres: 99%  88361 Out of 88786   

In [32]:
df.shape

# Drop Duplicates

In [33]:
df.reset_index(inplace=True)

In [34]:
df.drop_duplicates(inplace=True)

In [35]:
df.shape

# Create Binary Matrix

In [36]:
binary_matrix = uf.createBinaryMatrix(df)

Progeres: 100%  11739 Out of 11739   

In [37]:
binary_matrix.head()

Unnamed: 0,phenylalanine-tRNA ligase activity (GO:0004826),6-phosphogluconolactonase activity (GO:0017057),semaphorin receptor activity (GO:0017154),7SK snRNA binding (GO:0097322),androgen receptor binding (GO:0050681),transcription regulatory region DNA binding (GO:0044212),JUN kinase kinase kinase activity (GO:0004706),inositol monophosphate 3-phosphatase activity (GO:0052832),protein phosphatase inhibitor activity (GO:0004864),telomeric repeat-containing RNA binding (GO:0061752),...,ubiquitin-like protein ligase binding (GO:0044389),pyridoxal phosphatase activity (GO:0033883),oxytocin receptor activity (GO:0004990),sodium ion transmembrane transporter activity (GO:0015081),single-stranded DNA 3'-5' exodeoxyribonuclease activity (GO:0008310),annealing helicase activity (GO:0036310),tocopherol omega-hydroxylase activity (GO:0052870),epoxide hydrolase activity (GO:0004301),type 1 fibroblast growth factor receptor binding (GO:0005105),interferon receptor activity (GO:0004904)
CR2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CHFR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
OR8U8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HERC6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
LTBP2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
binary_matrix.shape

(11739, 3618)

# Save Binary Matrix

In [39]:
filename = '~/./Documents/Harmonizome/GO/output/go_molecular_function_binary_matrix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [40]:
path = '/Users/moshesilverstein/Documents/Harmonizome/GO/output/'

In [41]:
name = 'go_molecular_function_gene_set'

In [42]:
uf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  3618 Out of 3618   

# Create Attribute Library

In [43]:
path = '/Users/moshesilverstein/Documents/Harmonizome/GO/output/'

In [44]:
name = 'go_molecular_function_attribute_set'

In [45]:
uf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  11739 Out of 11739   

# Create Gene Similarity Matrix

In [46]:
gene_similarity_matix = uf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [47]:
gene_similarity_matix.head()

Unnamed: 0,CR2,CHFR,OR8U8,HERC6,LTBP2,GIMAP7,SOX3,MICAL2,LGALS3,SKP2,...,EEF1E1,DYNLRB1,COQ9,CDK2,ADIPOQ,SPAG5,NOTCH4,TMPRSS13,SRSF4,PPFIBP1
,,,,,,,,,,,,,,,,,,,,,
CR2,1.0,0.0,0.0,0.0,0.0,0.111111,0.1,0.0,0.0,0.0,...,0.0,0.0,0.333333,0.0,0.2,0.0,0.0,0.0,0.0,0.0
CHFR,0.0,1.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
OR8U8,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HERC6,0.0,0.333333,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.333333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LTBP2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Gene Similarity Matrix 

In [48]:
filename = '~/./Documents/Harmonizome/GO/output/go_molecular_function_gene_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity matrix

In [49]:
attribute_similarity_matix = uf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [50]:
attribute_similarity_matix.head()

Unnamed: 0,phenylalanine-tRNA ligase activity (GO:0004826),6-phosphogluconolactonase activity (GO:0017057),semaphorin receptor activity (GO:0017154),7SK snRNA binding (GO:0097322),androgen receptor binding (GO:0050681),transcription regulatory region DNA binding (GO:0044212),JUN kinase kinase kinase activity (GO:0004706),inositol monophosphate 3-phosphatase activity (GO:0052832),protein phosphatase inhibitor activity (GO:0004864),telomeric repeat-containing RNA binding (GO:0061752),...,ubiquitin-like protein ligase binding (GO:0044389),pyridoxal phosphatase activity (GO:0033883),oxytocin receptor activity (GO:0004990),sodium ion transmembrane transporter activity (GO:0015081),single-stranded DNA 3'-5' exodeoxyribonuclease activity (GO:0008310),annealing helicase activity (GO:0036310),tocopherol omega-hydroxylase activity (GO:0052870),epoxide hydrolase activity (GO:0004301),type 1 fibroblast growth factor receptor binding (GO:0005105),interferon receptor activity (GO:0004904)
,,,,,,,,,,,,,,,,,,,,,
phenylalanine-tRNA ligase activity (GO:0004826),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6-phosphogluconolactonase activity (GO:0017057),0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
semaphorin receptor activity (GO:0017154),0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7SK snRNA binding (GO:0097322),0.0,0.0,0.0,1.0,0.0,0.002639,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
androgen receptor binding (GO:0050681),0.0,0.0,0.0,0.0,1.0,0.019608,0.0,0.0,0.0,0.023256,...,0.01497,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Save Attribute Similarity Matrix

In [51]:
filename = '~/./Documents/Harmonizome/GO/output/go_molecular_function_attribute_similarity_matix_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [52]:
gene_list = uf.createGeneList(binary_matrix)

Progeres: 100%  11739 Out of 11739   

In [53]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,CR2,1380
1,CHFR,55743
2,OR8U8,504189
3,HERC6,55008
4,LTBP2,4053


In [54]:
gene_list.shape

(11739, 2)

# Save Gene List

In [55]:
filename = '~/./Documents/Harmonizome/GO/output/go_molecular_function_gene_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [56]:
attribute_list = uf.createAttributeList(binary_matrix)

In [57]:
attribute_list.head()

phenylalanine-tRNA ligase activity (GO:0004826)
6-phosphogluconolactonase activity (GO:0017057)
semaphorin receptor activity (GO:0017154)
7SK snRNA binding (GO:0097322)
androgen receptor binding (GO:0050681)


In [58]:
attribute_list.shape

(3618, 0)

# Save Attribute List

In [59]:
filename = '~/./Documents/Harmonizome/GO/output/go_molecular_function_attribute_list_%s.tsv.zip'% str(datetime.date.today())[0:7].replace('-', '_')
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [60]:
path = '/Users/moshesilverstein/Documents/Harmonizome/GO/output/'

In [61]:
name = 'go_molecular_function_gene_attribute_edge_list'

In [62]:
uf.createGeneAttributeEdgeList(binary_matrix, attribute_list, gene_list, path, name)

Progeres: 100%  3618 Out of 3618   

 The number of statisticaly relevent gene-attribute associations is: 48304
