# Gene Ontology (GO) Biological Process

In [None]:
import numpy as np
import pandas as pd
import sys, datetime
import goenrich
import scipy.stats as stat

In [None]:
# Download the human GO annotation file (GAF) and keep every line as ONE string
# column: the backtick separator is presumably absent from the file, so pandas
# performs no splitting here (fields are split on tabs in a later cell).
# NOTE(review): skiprows=41 presumably matches the length of the GAF comment
# header at the time of writing -- confirm against the current file.
GAF_URL = 'http://current.geneontology.org/annotations/goa_human.gaf.gz'
df = pd.read_csv(GAF_URL, sep='`', skiprows=41, header=None)

In [None]:
df.head()

In [None]:
## code copied from Gene Ontology (GO) Biological Process by Moshe Silverstein 

# Split every raw GAF line (held in the single column of `df`) into its
# tab-separated fields. Plain Python strings are used instead of a fixed-width
# character array so that fields longer than 150 characters are kept intact
# rather than silently truncated.
n_fields = 17
rows = [line.split('\t') for line in df.iloc[:, 0]]

# Fail loudly on malformed lines instead of letting pandas pad short rows.
if any(len(fields) != n_fields for fields in rows):
    raise ValueError(
        'every annotation line must contain exactly {} tab-separated fields'.format(n_fields)
    )

# Columns are labelled 0..16 here; descriptive names are attached later.
df_clean = pd.DataFrame(rows, columns=range(n_fields))

In [None]:
df_clean.head(10)

In [None]:
# Labels for the 17 tab-separated fields of an annotation line, in file order.
columns = [
    "DB",
    "DB gene ID",
    "Gene symbol",
    "Qualifier",
    "GO ID",
    "Reference",
    "Evidence code",
    "Evidence from",
    "GO class",
    "attribute",
    "Locus tag",
    "gene/protein",
    "tax id",
    "date",
    "Assigned by",
    "additional information",
    "empty",
]

In [None]:
# Replace the positional column labels with the descriptive names in `columns`.
df_clean.columns = columns

In [None]:
df_clean.head()

In [None]:
# Keep only rows whose "GO class" field is 'P' (presumably the biological
# process aspect -- confirm against the GAF specification). The copy detaches
# the result from `df_clean` so later edits do not touch the full table.
is_process = df_clean["GO class"] == 'P'
df_process = df_clean.loc[is_process].copy()

In [None]:
df_process.shape

In [None]:
# Remove annotations assigned by electronic matching (evidence code IEA).
df_process = df_process[df_process["Evidence code"] != 'IEA']

# Remove negated annotations, i.e. those stating that a gene is NOT associated
# with a term. The qualifier field may hold several pipe-separated values
# (e.g. 'NOT|involved_in'), so look for NOT as one of the pipe-delimited tokens
# instead of requiring the whole field to equal 'NOT'.
is_negated = df_process["Qualifier"].str.contains(r'(?:^|\|)NOT(?:\||$)', na=False)
df_process = df_process[~is_negated]

In [None]:
# Keep only the gene symbol / GO term pair and renumber the rows from 0,
# discarding the old index.
df_process = df_process.loc[:, ['Gene symbol', 'GO ID']].reset_index(drop=True)

In [None]:
df_process.shape

## Load GO digraph

Download `go-basic.obo` from http://snapshot.geneontology.org/ontology/go-basic.obo and save it in the working directory before running the next cell.

In [None]:
# Parse the local OBO file into the ontology graph used by the cells below.
obo_path = 'go-basic.obo'
digraph = goenrich.obo.ontology(obo_path)

## Keep GO IDs with depth >= 4

In [None]:
## code copied from Gene Ontology (GO) Biological Process by Moshe Silverstein 

# Blank out (NaN) every GO ID that is either missing from the ontology graph or
# sits at a depth below 4; rows marked this way are dropped in a later cell.
go_nodes = digraph.nodes
df_process['GO ID'] = [
    go_id if go_id in go_nodes and go_nodes[go_id]['depth'] >= 4 else np.nan
    for go_id in df_process['GO ID']
]

In [None]:
df_process.shape

In [None]:
# Discard rows containing missing values (the GO IDs blanked out above).
df_process = df_process.dropna()

In [None]:
df_process.shape

In [None]:
df_process.head()

## Term propagation: child term relationships to parent terms

In [None]:
## code copied from Gene Ontology (GO) Biological Process by Moshe Silverstein 

# For every (gene, term) annotation, also annotate the gene with each direct
# successor of the term in the ontology graph, provided that successor has a
# depth of at least 4. Successors are presumably the parent terms -- confirm
# the edge direction used by goenrich.
propagated = []
for gene, go_id in zip(df_process['Gene symbol'], df_process['GO ID']):
    for parent in digraph.successors(go_id):
        if parent not in digraph.nodes or digraph.nodes[parent]['depth'] < 4:
            continue
        propagated.append((gene, parent))

# Append the propagated annotations and renumber the rows from 0.
temp = pd.DataFrame(propagated, columns=['Gene symbol', 'GO ID'])
df_process = pd.concat([df_process, temp], ignore_index=True)

In [None]:
df_process.shape

In [None]:
df_process.head()

## Map GO ids to descriptive names

In [None]:
## code copied from Gene Ontology (GO) Biological Process by Moshe Silverstein 

# Rewrite each GO ID as "<term name> (<GO ID>)", taking the name from the
# ontology graph node.
df_process['GO ID'] = [
    '{} ({})'.format(digraph.nodes[go_id]['name'], go_id)
    for go_id in df_process['GO ID']
]

In [None]:
df_process.head()

In [None]:
# Collapse identical (gene symbol, GO term) rows, keeping the first occurrence.
df_process = df_process.drop_duplicates()

In [None]:
df_process.head()

In [None]:
# Renumber the rows from 0; the previous index is kept as a column named 'index'.
df_process = df_process.reset_index()

In [None]:
df_process.head()

In [None]:
# Discard the leftover 'index' column produced by the earlier reset_index().
df_process = df_process.drop('index', axis='columns')

In [None]:
df_process.head()

In [None]:
df_process.shape

In [None]:
# Count the distinct values in the second column (the GO term labels).
go_term_labels = df_process.iloc[:, 1]
number_terms = go_term_labels.unique().size
number_terms

## Build the protein-coding gene reference list

In [None]:
# Download the NCBI gene-info table for Homo sapiens (tab-separated, gzipped).
GENE_INFO_URL = "https://ftp.ncbi.nlm.nih.gov/gene/DATA/GENE_INFO/Mammalia/Homo_sapiens.gene_info.gz"
hs_geneInfo = pd.read_csv(GENE_INFO_URL, sep="\t")

In [None]:
hs_geneInfo.head()

In [None]:
hs_geneInfo.shape

In [None]:
# Restrict the gene table to entries typed as protein-coding.
is_protein_coding = hs_geneInfo["type_of_gene"] == "protein-coding"
hs_proteinCoding = hs_geneInfo[is_protein_coding]

In [None]:
hs_proteinCoding.shape

## Keep gene symbols that are in the Homo sapiens protein-coding table

In [None]:
# Keep only annotations whose gene symbol appears among the protein-coding
# gene symbols.
protein_coding_symbols = hs_proteinCoding["Symbol"].unique()
in_reference = df_process["Gene symbol"].isin(protein_coding_symbols)
df_process_protein = df_process[in_reference]

In [None]:
df_process_protein.shape

In [None]:
# Write the gene-set library in GMT format: one line per GO term, laid out as
#   <term label> <TAB> <empty description> <TAB> <gene 1> <TAB> <gene 2> ...
# Only terms with more than 4 distinct gene symbols are written.
filename = 'GO_Biological_Process_2021.gmt'
min_genes = 5

# Split splice variant names: keep only the part of the symbol before the first
# '.'. Working on a standalone Series avoids assigning into a filtered slice of
# the frame.
gene_symbols = df_process_protein["Gene symbol"].str.split('.').str[0]

# Group once by term instead of rescanning the whole frame for every term;
# sort=False keeps the terms in order of first appearance.
lines = []
for term, term_symbols in gene_symbols.groupby(df_process_protein["GO ID"], sort=False):
    genes = term_symbols.unique()
    if len(genes) >= min_genes:
        lines.append('{}\t\t{}'.format(term, "\t".join(genes)))

# The context manager closes the file even if the write fails.
with open(filename, 'w', encoding='utf-8') as gmt_file:
    gmt_file.write('\n'.join(lines))