# Gene Ontology (GO) Cellular Component

Author: Moshe Silverstein <br/>
Date: 7-17 <br/>
Data Downloaded: 04-2017 <br/>
Data Source: http://geneontology.org/gene-associations/goa_human.gaf.gz

In [1]:
import sys, datetime
import numpy as np
import pandas as pd
import importlib
import my_functions as mf
import goenrich
%matplotlib inline

In [2]:
importlib.reload(mf)

<module 'my_functions' from '/Users/moshesilverstein/Documents/Harmonizome/GO/my_functions.py'>

# Load Data

In [3]:
# GAF files start with a block of '!'-prefixed header/comment lines whose
# length changes between releases, so count them instead of hard-coding 34.
# The file is scanned in binary mode so that no decoding is attempted here.
gaf_path = 'input/goa_human.gaf'
header_lines = 0
with open(gaf_path, 'rb') as gaf_file:
    for raw_line in gaf_file:
        if not raw_line.startswith(b'!'):
            break
        header_lines += 1

# No header row in the data section; column names are assigned in a later cell.
df = pd.read_csv(gaf_path, sep='\t', skiprows=header_lines, header=None, index_col=False)

In [4]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0004871,GO_REF:0000038,IEA,UniProtKB-KW:KW-0807,F,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,UniProt,,
1,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0005834,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898,C,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,InterPro,,
2,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0007186,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898,P,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,InterPro,,
3,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0002377,GO_REF:0000033,IBA,PANTHER:PTN000587099,P,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,,
4,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0005615,GO_REF:0000033,IBA,PANTHER:PTN000587099,C,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,,


In [5]:
df.shape

(417076, 17)

# Get Relevant Columns and Data

In [6]:
# Names for the 17 tab-separated fields of the annotation file, in file order.
columns = [
    "DB",
    "DB Object ID",
    "DB Object Symbol",
    "Qualifier",
    "GO ID",
    "DB:Reference",
    "Evidence Code",
    "With (or) From",
    "Aspect",
    "DB Object Name",
    "DB Object Synonym",
    "DB Object Type",
    "Taxon",
    "Date",
    "Assigned By",
    "Annotation Extension",
    "Gene Product Form ID",
]

In [7]:
df.columns = columns

In [8]:
df.head()

Unnamed: 0,DB,DB Object ID,DB Object Symbol,Qualifier,GO ID,DB:Reference,Evidence Code,With (or) From,Aspect,DB Object Name,DB Object Synonym,DB Object Type,Taxon,Date,Assigned By,Annotation Extension,Gene Product Form ID
0,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0004871,GO_REF:0000038,IEA,UniProtKB-KW:KW-0807,F,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,UniProt,,
1,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0005834,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898,C,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,InterPro,,
2,UniProtKB,A0A024R161,DNAJC25-GNG10,,GO:0007186,GO_REF:0000002,IEA,InterPro:IPR001770|InterPro:IPR015898,P,Guanine nucleotide-binding protein subunit gamma,A0A024R161_HUMAN|DNAJC25-GNG10|hCG_1994888,protein,taxon:9606,20170506,InterPro,,
3,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0002377,GO_REF:0000033,IBA,PANTHER:PTN000587099,P,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,,
4,UniProtKB,A0A075B6H7,IGKV3-7,,GO:0005615,GO_REF:0000033,IBA,PANTHER:PTN000587099,C,Immunoglobulin kappa variable 3-7 (non-functio...,A0A075B6H7_HUMAN|IGKV3-7,protein,taxon:9606,20150528,GO_Central,,


#### Get Only Cellular Component Data

In [9]:
# Keep only rows whose Aspect is 'C'; copy so later edits do not touch the
# originally loaded frame.
is_cellular_component = df['Aspect'] == 'C'
df = df.loc[is_cellular_component].copy()

#### Drop All Data That Is Inferred from Electronic Annotation

In [10]:
# Discard every annotation whose evidence code is IEA.
electronic_codes = ['IEA']
is_electronic = df['Evidence Code'].isin(electronic_codes)
df = df[~is_electronic]

#### Drop All Annotations With a 'NOT' Qualifier

In [11]:
# The GAF Qualifier field may hold several pipe-separated flags
# (e.g. 'NOT|colocalizes_with'), so test each flag instead of comparing the
# whole cell to 'NOT'; otherwise compound negated annotations slip through.
is_negated = (
    df['Qualifier']
    .fillna('')            # empty qualifiers are read as NaN
    .astype(str)
    .str.split('|')
    .apply(lambda flags: 'NOT' in flags)
    .astype(bool)          # keeps the mask boolean even when df is empty
)
df = df[~is_negated]

In [12]:
df = df[['DB Object Symbol', 'GO ID']]

In [13]:
df.reset_index(inplace=True)

In [14]:
df.drop('index', axis=1, inplace=True)

In [15]:
df.head()

Unnamed: 0,DB Object Symbol,GO ID
0,IGKV3-7,GO:0005615
1,IGKV1D-42,GO:0005615
2,IGLV4-69,GO:0005615
3,IGLV8-61,GO:0005615
4,IGLV4-60,GO:0005615


In [16]:
df.shape

(120758, 2)

# Load Gene Ontology Tree Digraph

In [17]:
# Build the ontology digraph from the OBO release file. The extension is
# '.obo' (the previous path ended in '.ob', which does not match the
# go-basic.obo file distributed by the Gene Ontology).
digraph = goenrich.obo.ontology('input/go-basic.obo')

# Keep Only Terms With a Tree Depth of 4 or Greater

In [18]:
# Minimum 'depth' attribute a term must have to be kept (see section title).
min_depth = 4

# networkx 2.4 removed ``Graph.node``; use ``Graph.nodes`` when it is absent.
node_attrs = digraph.node if hasattr(digraph, 'node') else digraph.nodes

# Pull the column out once rather than indexing the frame row by row with the
# removed ``DataFrame.ix`` accessor.
terms = df['GO ID'].tolist()
total = len(terms)
lst = []

for count, term in enumerate(terms, start=1):

    progressPercent = (count/total)*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, count, total))
    sys.stdout.flush()

    # Terms missing from the ontology, or shallower than min_depth, become
    # NaN so that the dropna() in a later cell removes their rows.
    if term in node_attrs and node_attrs[term]['depth'] >= min_depth:
        lst.append(term)
    else:
        lst.append(np.nan)

df['GO ID'] = lst

Progress: 99%  120534 Out of 120758   

In [19]:
df.shape

(120758, 2)

In [20]:
df.dropna(inplace=True)

In [21]:
df.shape

(69139, 2)

In [22]:
df.head()

Unnamed: 0,DB Object Symbol,GO ID
63,A0A075B6Q4,GO:0005634
64,A0A075B6Q4,GO:0030688
65,A0A075B6Q4,GO:0031902
152,A0A087WVE0,GO:0005634
158,A0A087WWV3,GO:0005634


# Term Propagation: Propagate Child Gene-Term Relationships to Parent Terms

In [23]:
# Minimum 'depth' attribute a parent term must have to receive a propagated row.
min_depth = 4

# networkx 2.4 removed ``Graph.node``; use ``Graph.nodes`` when it is absent.
node_attrs = digraph.node if hasattr(digraph, 'node') else digraph.nodes

# Read both columns once instead of indexing row by row with the removed
# ``DataFrame.ix`` accessor.
symbols = df['DB Object Symbol'].tolist()
terms = df['GO ID'].tolist()
total = len(terms)

lst1 = []  # gene symbol of each propagated association
lst2 = []  # GO ID of the predecessor term it is propagated to

for count, (symbol, term) in enumerate(zip(symbols, terms), start=1):

    progressPercent = (count/total)*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, count, total))
    sys.stdout.flush()

    # Only the direct predecessors of each term are visited here.
    for parent in digraph.predecessors(term):
        if parent in node_attrs and node_attrs[parent]['depth'] >= min_depth:
            lst1.append(symbol)
            lst2.append(parent)


temp = pd.DataFrame(
    {'DB Object Symbol': lst1, 'GO ID': lst2},
    columns=['DB Object Symbol', 'GO ID'],
)
# ignore_index=True yields a fresh 0..n-1 index, replacing the former
# reset_index + drop('index') pair.
df = pd.concat([df, temp], ignore_index=True)

Progress: 99%  69067 Out of 69139   

In [24]:
df.shape

(333911, 2)

# Map GO ID to Descriptive Name 

In [25]:
# networkx 2.4 removed ``Graph.node``; use ``Graph.nodes`` when it is absent.
node_attrs = digraph.node if hasattr(digraph, 'node') else digraph.nodes

# Read the column once instead of indexing row by row with the removed
# ``DataFrame.ix`` accessor.
terms = df['GO ID'].tolist()
total = len(terms)
lst = []

for count, term in enumerate(terms, start=1):

    progressPercent = (count/total)*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, count, total))
    sys.stdout.flush()

    lst.append(node_attrs[term]['name'])

# The column keeps the label 'GO ID' but now holds the terms' 'name' values.
df['GO ID'] = lst

Progress: 99%  333393 Out of 333911   

In [26]:
df.head()

Unnamed: 0,DB Object Symbol,GO ID
0,A0A075B6Q4,nucleus
1,A0A075B6Q4,"preribosome, small subunit precursor"
2,A0A075B6Q4,late endosome membrane
3,A0A087WVE0,nucleus
4,A0A087WWV3,nucleus


# Map Gene Symbols To Up-to-date Approved Gene Symbols

In [29]:
df.set_index('DB Object Symbol', inplace=True)

In [30]:
# Helper from my_functions, called with df indexed by 'DB Object Symbol'.
# NOTE(review): the return value is discarded, so this presumably edits df in
# place (the recorded row count drops from 333911 to 332819 after this cell)
# — confirm against my_functions.mapgenesymbols.
mf.mapgenesymbols(df)

Progeres: 100%  333911 Out of 333911   

In [31]:
df.shape

(332819, 1)

# Drop Duplicates

In [32]:
df.reset_index(inplace=True)

In [33]:
df.drop_duplicates(inplace=True)

In [34]:
df.shape

(188564, 2)

# Create Binary Matrix

In [35]:
binary_matrix = mf.createBinaryMatix(df)

Progeres: 100%  11532 Out of 11532   

In [36]:
binary_matrix.head()

Unnamed: 0,"vacuolar proton-transporting V-type ATPase, V1 domain",extrinsic component of autophagosome membrane,junctional sarcoplasmic reticulum membrane,Barr body,cytoplasmic ribonucleoprotein granule,NatA complex,phagocytic vesicle lumen,TEAD-2-YAP complex,chromaffin granule,vacuolar proton-transporting V-type ATPase complex,...,endoplasmic reticulum tubular network,axonemal outer doublet,actin cap,integrin alpha3-beta1 complex,microsporocyte nucleus,bicellular tight junction,cytolytic granule,generative cell nucleus,telomerase catalytic core complex,PBAF complex
ADAM22,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
NDUFA1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
FBXO3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
KCNQ2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
PIK3AP1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
binary_matrix.shape

(11532, 1275)

# Save Binary Matrix

In [38]:
# Stamp the output name with the current year and month as YYYY_MM.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GO/output/go_cellular_component_binary_matrix_%s.tsv.zip' % date_stamp
# NOTE(review): the file is written gzip-compressed although its name ends in
# '.zip' — confirm downstream readers expect this before changing either.
binary_matrix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene Set Library

In [39]:
path = '/Users/moshesilverstein/Documents/Harmonizome/GO/output/'

In [40]:
name = 'go_cellular_component_gene_set'

In [41]:
mf.createUpGeneSetLib(binary_matrix, path, name)

Progeres: 100%  1275 Out of 1275   

# Create Attribute Library

In [42]:
path = '/Users/moshesilverstein/Documents/Harmonizome/GO/output/'

In [43]:
name = 'go_cellular_component_attribute_set'

In [44]:
mf.createUpAttributeSetLib(binary_matrix, path, name)

Progeres: 100%  11532 Out of 11532   

# Create Gene Similarity Matrix

In [45]:
gene_similarity_matix = mf.createSimilarityMatrix(binary_matrix, 'jaccard')

In [46]:
gene_similarity_matix.head()

Unnamed: 0,ADAM22,NDUFA1,FBXO3,KCNQ2,PIK3AP1,BEST2,CDH2,ATP5I,HERC5,B3GALT6,...,GBF1,MLIP,DDX3Y,FUK,EFCAB6,GNRHR2,BCL11B,ZYG11B,GAK,OXSR1
ADAM22,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
NDUFA1,0.0,1.0,0.2,0.0,0.230769,0.0,0.0,0.4375,0.230769,0.0,...,0.136364,0.0,0.090909,0.230769,0.0,0.0,0.0,0.0,0.230769,0.230769
FBXO3,0.0,0.2,1.0,0.0,0.6,0.0,0.0,0.0,0.6,0.0,...,0.214286,0.071429,0.12,0.6,0.4,0.0,0.0,0.0,0.6,0.6
KCNQ2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
PIK3AP1,0.0,0.230769,0.6,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.25,0.0,0.130435,1.0,0.0,0.0,0.0,0.0,1.0,1.0


# Save Gene Similarity Matrix 

In [47]:
# Stamp the output name with the current year and month as YYYY_MM.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GO/output/go_cellular_component_gene_similarity_matix_%s.tsv.zip' % date_stamp
# NOTE(review): the file is written gzip-compressed although its name ends in
# '.zip' — confirm downstream readers expect this before changing either.
gene_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Attribute Similarity Matrix

In [48]:
attribute_similarity_matix = mf.createSimilarityMatrix(binary_matrix.T, 'jaccard')

In [49]:
attribute_similarity_matix.head()

Unnamed: 0,"vacuolar proton-transporting V-type ATPase, V1 domain",extrinsic component of autophagosome membrane,junctional sarcoplasmic reticulum membrane,Barr body,cytoplasmic ribonucleoprotein granule,NatA complex,phagocytic vesicle lumen,TEAD-2-YAP complex,chromaffin granule,vacuolar proton-transporting V-type ATPase complex,...,endoplasmic reticulum tubular network,axonemal outer doublet,actin cap,integrin alpha3-beta1 complex,microsporocyte nucleus,bicellular tight junction,cytolytic granule,generative cell nucleus,telomerase catalytic core complex,PBAF complex
"vacuolar proton-transporting V-type ATPase, V1 domain",1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
extrinsic component of autophagosome membrane,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
junctional sarcoplasmic reticulum membrane,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Barr body,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.005457,0.0,0.0,0.005457,0.0,0.038462
cytoplasmic ribonucleoprotein granule,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.004901,0.0,0.0,0.004901,0.019231,0.0


# Save Attribute Similarity Matrix

In [50]:
# Stamp the output name with the current year and month as YYYY_MM.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GO/output/go_cellular_component_attribute_similarity_matix_%s.tsv.zip' % date_stamp
# NOTE(review): the file is written gzip-compressed although its name ends in
# '.zip' — confirm downstream readers expect this before changing either.
attribute_similarity_matix.to_csv(filename, sep='\t', compression='gzip')

# Create Gene List

In [51]:
gene_list = mf.createGeneList(binary_matrix)

Progeres: 100%  11532 Out of 11532   

In [52]:
gene_list.head()

Unnamed: 0,GeneSym,GeneID
0,ADAM22,53616
1,NDUFA1,4694
2,FBXO3,26273
3,KCNQ2,3785
4,PIK3AP1,118788


In [53]:
gene_list.shape

(11532, 2)

# Save Gene List

In [54]:
# Stamp the output name with the current year and month as YYYY_MM.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GO/output/go_cellular_component_gene_list_%s.tsv.zip' % date_stamp
# NOTE(review): the file is written gzip-compressed although its name ends in
# '.zip' — confirm downstream readers expect this before changing either.
# The row index is not written.
gene_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Attribute List 

In [55]:
attribute_list = mf.createAttributeList(binary_matrix)

In [56]:
attribute_list.head()

Unnamed: 0,Attributes
0,"vacuolar proton-transporting V-type ATPase, V1..."
1,extrinsic component of autophagosome membrane
2,junctional sarcoplasmic reticulum membrane
3,Barr body
4,cytoplasmic ribonucleoprotein granule


In [57]:
attribute_list.shape

(1275, 1)

# Save Attribute List

In [58]:
# Stamp the output name with the current year and month as YYYY_MM.
date_stamp = datetime.date.today().strftime('%Y_%m')
filename = '~/./Documents/Harmonizome/GO/output/go_cellular_component_attribute_list_%s.tsv.zip' % date_stamp
# NOTE(review): the file is written gzip-compressed although its name ends in
# '.zip' — confirm downstream readers expect this before changing either.
# The row index is not written.
attribute_list.to_csv(filename, sep='\t', index=False, compression='gzip')

# Create Gene-Attribute Edge List

In [59]:
path = '/Users/moshesilverstein/Documents/Harmonizome/GO/output/'

In [60]:
name = 'go_cellular_component_gene_attribute_edge_list'

In [61]:
mf.createGeneAttributeEdgeList(binary_matrix, gene_list, path, name)

Progeres: 100%  1275 Out of 1275   

 The number of statisticaly relevent gene-attribute associations is: 188564
