# Gene Ontology (GO) Cellular Component

Author: Zachary Flamholz  
Date: 06-2018  
Database: http://flybase.org/   
Data: http://geneontology.org/gene-associations/gene_association.fb.gz    
Companion file: http://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Invertebrates/Drosophila_melanogaster.gene_info.gz, http://snapshot.geneontology.org/ontology/go-basic.obo    

In [1]:
import numpy as np
import pandas as pd
import sys, datetime
import goenrich
import scipy.stats as stat

# Versions of Modules in Use

In [None]:
%load_ext version_information
%version_information numpy, pandas

## read in data

In [2]:
df = pd.read_csv('goa_human.gaf', sep='`', skiprows=41, header=None)

In [3]:
df.head()

Unnamed: 0,0
0,UniProtKB\tA0A024RBG1\tNUDT4B\tenables\tGO:000...
1,UniProtKB\tA0A024RBG1\tNUDT4B\tenables\tGO:004...
2,UniProtKB\tA0A024RBG1\tNUDT4B\tenables\tGO:005...
3,UniProtKB\tA0A024RBG1\tNUDT4B\tenables\tGO:005...
4,UniProtKB\tA0A024RBG1\tNUDT4B\tlocated_in\tGO:...


In [4]:
## code copied from Gene Ontology (GO) Biological component by Moshe Silverstein 

matrix = np.chararray((df.shape[0], 17), itemsize=150, unicode=True)

for i, row in enumerate(df.itertuples()):
    
    progressPercent = ((i+1)/len(df.index))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(df.index)))
    sys.stdout.flush()
    
    lst = row[1].split('\t')

    matrix[i, :] = lst

df_clean = pd.DataFrame(data=matrix)

Progress: 100%  619347 Out of 619347   

In [5]:
df_clean.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0003723,GO_REF:0000043,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210417,UniProt,,
1,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0046872,GO_REF:0000043,IEA,UniProtKB-KW:KW-0479,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210417,UniProt,,
2,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0052840,GO_REF:0000003,IEA,EC:3.6.1.52,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210417,UniProt,,
3,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0052842,GO_REF:0000003,IEA,EC:3.6.1.52,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210417,UniProt,,
4,UniProtKB,A0A024RBG1,NUDT4B,located_in,GO:0005829,GO_REF:0000052,IDA,,C,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20161204,HPA,,
5,UniProtKB,A0A075B6H7,IGKV3-7,involved_in,GO:0002250,GO_REF:0000043,IEA,UniProtKB-KW:KW-1064,P,Probable non-functional immunoglobulin kappa v...,IGKV3-7,protein,taxon:9606,20210417,UniProt,,
6,UniProtKB,A0A075B6H7,IGKV3-7,located_in,GO:0005886,GO_REF:0000044,IEA,UniProtKB-SubCell:SL-0039,C,Probable non-functional immunoglobulin kappa v...,IGKV3-7,protein,taxon:9606,20210417,UniProt,,
7,UniProtKB,A0A075B6H7,IGKV3-7,part_of,GO:0019814,GO_REF:0000043,IEA,UniProtKB-KW:KW-1280,C,Probable non-functional immunoglobulin kappa v...,IGKV3-7,protein,taxon:9606,20210417,UniProt,,
8,UniProtKB,A0A075B6H8,IGKV1D-42,involved_in,GO:0002250,GO_REF:0000043,IEA,UniProtKB-KW:KW-1064,P,Probable non-functional immunoglobulin kappa v...,IGKV1D-42,protein,taxon:9606,20210417,UniProt,,
9,UniProtKB,A0A075B6H8,IGKV1D-42,located_in,GO:0005886,GO_REF:0000044,IEA,UniProtKB-SubCell:SL-0039,C,Probable non-functional immunoglobulin kappa v...,IGKV1D-42,protein,taxon:9606,20210417,UniProt,,


In [6]:
columns = ["DB", "DB gene ID", "Gene symbol", "Qualifier", "GO ID", "Reference", "Evidence code", "Evidence from", "GO class", "attribute", "Locus tag", "gene/protein", "tax id", "date", "Assigned by", "additional information", "empty"]

In [7]:
df_clean.columns = columns

In [8]:
df_clean.head()

Unnamed: 0,DB,DB gene ID,Gene symbol,Qualifier,GO ID,Reference,Evidence code,Evidence from,GO class,attribute,Locus tag,gene/protein,tax id,date,Assigned by,additional information,empty
0,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0003723,GO_REF:0000043,IEA,UniProtKB-KW:KW-0694,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210417,UniProt,,
1,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0046872,GO_REF:0000043,IEA,UniProtKB-KW:KW-0479,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210417,UniProt,,
2,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0052840,GO_REF:0000003,IEA,EC:3.6.1.52,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210417,UniProt,,
3,UniProtKB,A0A024RBG1,NUDT4B,enables,GO:0052842,GO_REF:0000003,IEA,EC:3.6.1.52,F,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20210417,UniProt,,
4,UniProtKB,A0A024RBG1,NUDT4B,located_in,GO:0005829,GO_REF:0000052,IDA,,C,Diphosphoinositol polyphosphate phosphohydrola...,NUDT4B,protein,taxon:9606,20161204,HPA,,


In [9]:
df_component = df_clean[df_clean["GO class"] == 'C'].copy()

In [10]:
df_component.shape

(171166, 17)

In [11]:
# remove any annotation assigned by electronic matching and with the NOT qualifier which is used to specify a gene is not associated with a term
df_component = df_component[df_component["Evidence code"] != 'IEA']
df_component = df_component[df_component["Qualifier"] != 'NOT']

In [12]:
df_component = df_component[['Gene symbol', 'GO ID']]
df_component.reset_index(inplace=True)
df_component = df_component.drop(columns = 'index')

In [13]:
df_component.shape

(153396, 2)

## load GO digraph

In [14]:
digraph = goenrich.obo.ontology('go-basic.obo')

## only keep GO IDs with depth =>4, replace others with NaN and then remove those rows

In [15]:
lst = []

In [16]:
## code copied from Gene Ontology (GO) Biological component by Moshe Silverstein 
for i, index in enumerate(df_component.index):
    
    progressPercent = ((i+1)/len(df_component.index))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(df_component.index)))
    sys.stdout.flush()
    try:
        term = df_component.loc[index, 'GO ID']
        if term in digraph.nodes:
            if digraph.nodes[term]['depth'] >= 4:
                lst.append(term)
            else:
                lst.append(np.nan)
        else:
            lst.append(np.nan)
    except:
        break

Progress: 100%  153396 Out of 153396   

In [17]:
df_component['GO ID'] = lst

In [18]:
df_component.shape

(153396, 2)

In [19]:
df_component.dropna(inplace=True)

In [20]:
df_component.shape

(38229, 2)

In [21]:
df_component.head()

Unnamed: 0,Gene symbol,GO ID
1,TRBV19,GO:0042105
122,TRAV27,GO:0042105
124,MEIKIN,GO:0000779
144,TRAV19,GO:0005887
146,PIGBOS1,GO:0031307


## term propagation-propergate child gene term relationships to parent terms

In [22]:
## code copied from Gene Ontology (GO) Biological component by Moshe Silverstein 
lst1 = []
lst2 = []

for i,index in enumerate(df_component.index):
    
#     progressPercent = ((i+1)/len(df.index))*100

#     sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(df.index)))
#     sys.stdout.flush()
    
#     term = df.loc[index, 'GO ID']
#     for parent in digraph.predecessors(term):
#         if parent in digraph.node:
#             if digraph.node[parent]['depth'] >= 4:
#                 lst1.append(df.loc[index, 'DB Object Symbol'])
#                 lst2.append(parent)
#                 print(term, parent)
    term = df_component.loc[index, 'GO ID']
    for parent in digraph.successors(term):
        if parent in digraph.nodes:
            if digraph.nodes[parent]['depth'] >= 4:
                lst1.append(df_component.loc[index, 'Gene symbol'])
                lst2.append(parent)
#                 print(term, parent)


temp = pd.DataFrame()
temp['Gene symbol'] = lst1
temp['GO ID']  = lst2
df_component = pd.concat([df_component, temp])
df_component.reset_index(inplace=True)
df_component = df_component.drop(columns = 'index')

In [23]:
df_component.shape

(76064, 2)

In [24]:
df_component.head()

Unnamed: 0,Gene symbol,GO ID
0,TRBV19,GO:0042105
1,TRAV27,GO:0042105
2,MEIKIN,GO:0000779
3,TRAV19,GO:0005887
4,PIGBOS1,GO:0031307


## map GO ID to descriptive name

In [25]:
lst = []

In [None]:
## code copied from Gene Ontology (GO) Biological component by Moshe Silverstein 
for i,index in enumerate(df_component.index):
    
    progressPercent = ((i+1)/len(df_component.index))*100

    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(df_component.index)))
    sys.stdout.flush()
    try:
        lst.append(str(digraph.nodes[df_component.loc[index, 'GO ID']]['name'])+' '+ '('+str(df_component.loc[index, 'GO ID'])+')')
    except:
        break

Progress: 99%  76034 Out of 76064   

In [27]:
df_component['GO ID'] = lst

In [28]:
df_component.head()

Unnamed: 0,Gene symbol,GO ID
0,TRBV19,alpha-beta T cell receptor complex (GO:0042105)
1,TRAV27,alpha-beta T cell receptor complex (GO:0042105)
2,MEIKIN,"condensed chromosome, centromeric region (GO:0..."
3,TRAV19,integral component of plasma membrane (GO:0005...
4,PIGBOS1,integral component of mitochondrial outer memb...


In [29]:
df_component.drop_duplicates(inplace=True)

In [30]:
df_component.head()

Unnamed: 0,Gene symbol,GO ID
0,TRBV19,alpha-beta T cell receptor complex (GO:0042105)
1,TRAV27,alpha-beta T cell receptor complex (GO:0042105)
2,MEIKIN,"condensed chromosome, centromeric region (GO:0..."
3,TRAV19,integral component of plasma membrane (GO:0005...
4,PIGBOS1,integral component of mitochondrial outer memb...


In [31]:
df_component.reset_index(inplace=True)

In [32]:
df_component.head()

Unnamed: 0,index,Gene symbol,GO ID
0,0,TRBV19,alpha-beta T cell receptor complex (GO:0042105)
1,1,TRAV27,alpha-beta T cell receptor complex (GO:0042105)
2,2,MEIKIN,"condensed chromosome, centromeric region (GO:0..."
3,3,TRAV19,integral component of plasma membrane (GO:0005...
4,4,PIGBOS1,integral component of mitochondrial outer memb...


In [33]:
df_component = df_component.drop(columns = 'index')

In [34]:
df_component.head()

Unnamed: 0,Gene symbol,GO ID
0,TRBV19,alpha-beta T cell receptor complex (GO:0042105)
1,TRAV27,alpha-beta T cell receptor complex (GO:0042105)
2,MEIKIN,"condensed chromosome, centromeric region (GO:0..."
3,TRAV19,integral component of plasma membrane (GO:0005...
4,PIGBOS1,integral component of mitochondrial outer memb...


In [35]:
df_component.shape

(42630, 2)

In [36]:
number_terms = len(df_component.iloc[:, 1].unique())
number_terms

926

## build the protein-coding gene list

In [37]:
dMelanogaster_geneInfo = pd.read_csv("Homo_sapiens.gene_info", sep="\t")

In [38]:
dMelanogaster_geneInfo.head()

Unnamed: 0,#tax_id,GeneID,Symbol,LocusTag,Synonyms,dbXrefs,chromosome,map_location,description,type_of_gene,Symbol_from_nomenclature_authority,Full_name_from_nomenclature_authority,Nomenclature_status,Other_designations,Modification_date,Feature_type
0,9606,1,A1BG,-,A1B|ABG|GAB|HYST2477,MIM:138670|HGNC:HGNC:5|Ensembl:ENSG00000121410,19,19q13.43,alpha-1-B glycoprotein,protein-coding,A1BG,alpha-1-B glycoprotein,O,alpha-1B-glycoprotein|HEL-S-163pA|epididymis s...,20210518,-
1,9606,2,A2M,-,A2MD|CPAMD5|FWP007|S863-7,MIM:103950|HGNC:HGNC:7|Ensembl:ENSG00000175899,12,12p13.31,alpha-2-macroglobulin,protein-coding,A2M,alpha-2-macroglobulin,O,alpha-2-macroglobulin|C3 and PZP-like alpha-2-...,20210524,-
2,9606,3,A2MP1,-,A2MP,HGNC:HGNC:8|Ensembl:ENSG00000256069,12,12p13.31,alpha-2-macroglobulin pseudogene 1,pseudo,A2MP1,alpha-2-macroglobulin pseudogene 1,O,pregnancy-zone protein pseudogene,20210518,-
3,9606,9,NAT1,-,AAC1|MNAT|NAT-1|NATI,MIM:108345|HGNC:HGNC:7645|Ensembl:ENSG00000171428,8,8p22,N-acetyltransferase 1,protein-coding,NAT1,N-acetyltransferase 1,O,arylamine N-acetyltransferase 1|N-acetyltransf...,20210607,-
4,9606,10,NAT2,-,AAC2|NAT-2|PNAT,MIM:612182|HGNC:HGNC:7646|Ensembl:ENSG00000156006,8,8p22,N-acetyltransferase 2,protein-coding,NAT2,N-acetyltransferase 2,O,arylamine N-acetyltransferase 2|N-acetyltransf...,20210518,-


In [39]:
dMelanogaster_geneInfo.shape

(62728, 16)

In [40]:
dMelanogaster_proteinCoding = dMelanogaster_geneInfo[dMelanogaster_geneInfo["type_of_gene"] == "protein-coding"]

In [41]:
dMelanogaster_proteinCoding.shape

(19688, 16)

## only keep gene symbolys that are in the S cerevisiae table

In [42]:
df_component_protein = df_component.loc[df_component["Gene symbol"].isin(dMelanogaster_proteinCoding["Symbol"].unique())]

In [43]:
df_component_protein.shape

(41947, 2)

In [45]:
filename = 'GO_Cellular_Component_2021.gmt'
file = open(filename,'w+') 
terms = df_component_protein["GO ID"].unique()
for i,term in enumerate(terms):
    
    progressPercent = ((i+1)/len(terms))*100
    sys.stdout.write("Progress: %d%%  %d Out of %d   \r" % (progressPercent, (i+1), len(terms)))
    sys.stdout.flush()
            
    df_byTerm = df_component_protein.loc[df_component_protein["GO ID"] == term]
    
    if df_byTerm.shape[0] > 4:
        # split splice variant names
        split_splice = lambda x: x.split('.')[0]
        df_byTerm["Gene symbol"] = df_byTerm["Gene symbol"].apply(split_splice)
        
        
        if len(df_byTerm["Gene symbol"].unique()) > 4:
            file.write("%s\t" % term)
            file.write("\t")
            genes = df_byTerm["Gene symbol"].unique()
    
            for gene in genes:
                file.write("%s\t" % gene)
            file.write("\n")
        
file.close()

Progress: 3%  32 Out of 926   

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_byTerm["Gene symbol"] = df_byTerm["Gene symbol"].apply(split_splice)


Progress: 100%  926 Out of 926   