# Gene Ontology Biological Process Annotations 2023
This notebook contains the script used to process the Gene Ontology Biological Process Annotations 2023 dataset for Harmonizome. The annotations file was downloaded from [Gene Ontology](https://geneontology.org/docs/download-go-annotations/). The GO ontology tree was also used to keep only terms with depth ≥4, which were then mapped to descriptive names. Finally, genes were mapped to up-to-date and approved gene symbols.

In all, this dataset contains 198,050 associations between 14,811 genes and 12,318 biological processes.

In [None]:
import datetime
import json
import os
import sys
from collections import OrderedDict

import anndata
import goenrich
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.spatial.distance as dist
import seaborn
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

## Load and Preprocess Data

In [None]:
# Read the human GO annotation file (GAF). The file has no header row, so the
# 17 GAF column names are assigned explicitly below.
# NOTE(review): skiprows=41 assumes the leading comment block is exactly 41 lines
# long in this download -- confirm when the source file is refreshed.
goannotations = pd.read_csv('newdata/GeneOntology/goa_human.gaf', sep='\t', skiprows=41, header=None, index_col=False, dtype=object)
goannotations.columns = ['DB', 'DB Object ID', 'DB Object Symbol', 'Qualifier', 'GO ID', 'DB:Reference', 'Evidence Code', 
                        'With (or) From', 'Aspect', 'DB Object Name', 'DB Object Synonym', 'DB Object Type', 'Taxon', 
                        'Date', 'Assigned By', 'Annotation Extension', 'Gene Product Form ID']

# Filter out associations inferred from electronic annotation and qualifiers containing 'NOT'.
# The qualifier is matched as a substring because negation is written together with the
# relation (e.g. 'NOT|involved_in'); an exact comparison with 'NOT' lets those rows through.
# Rows this removes were previously kept, so dataset summary counts should be regenerated.
is_negated = goannotations['Qualifier'].str.contains('NOT', na=False)
is_electronic = goannotations['Evidence Code'].eq('IEA')
# Combine both masks in one indexing step so they stay aligned with the frame being filtered.
goannotations = goannotations[~is_negated & ~is_electronic]
goannotations

In [None]:
# Keep only biological-process annotations (Aspect == 'P') and the two columns
# needed downstream: the gene symbol and the GO term identifier.
is_biological_process = goannotations['Aspect'] == 'P'
bioproc = goannotations.loc[is_biological_process, ['DB Object Symbol', 'GO ID']].reset_index(drop=True)
bioproc

## Process Data

### Load Gene Ontology Tree Digraph

In [None]:
# Parse the GO OBO file into a directed graph. Later cells read each node's
# 'depth' and 'name' attributes and treat a term's successors as its parent terms.
digraph = goenrich.obo.ontology('newdata/GeneOntology/go-basic.obo')

### Keep Only Terms with Tree Depth >= 4

In [None]:
# Keep an annotation only when its GO term exists in the ontology graph and sits at
# depth >= 4; every other term is blanked out and the row is then dropped.
# Done as a single Series.map instead of a per-row .loc loop, which also removes the
# need for a progress bar here.
bioproc['GO ID'] = bioproc['GO ID'].map(
    lambda term: term if term in digraph.nodes and digraph.nodes[term]['depth'] >= 4 else np.nan
)
# dropna() also discards rows whose gene symbol is missing, as before.
bioproc = bioproc.dropna()
bioproc

### Propagate Child Gene-Term Relations to Parent Terms

In [None]:
# For every (gene, term) annotation, add the same gene to each successor of the term in
# `digraph` (used here as the term's parents) that also has depth >= 4.
# Successors are graph nodes by construction, so no separate membership test is needed.
# NOTE: this cell is not idempotent -- re-running it propagates one more level up.
parent_records = [
    (symbol, parent)
    for symbol, term in zip(bioproc['DB Object Symbol'], bioproc['GO ID'])
    for parent in digraph.successors(term)
    if digraph.nodes[parent]['depth'] >= 4
]
parent_annotations = pd.DataFrame(parent_records, columns=['DB Object Symbol', 'GO ID'])

# Append the propagated rows; duplicates are removed later, after gene symbols are mapped.
bioproc = pd.concat([bioproc, parent_annotations]).reset_index(drop=True)
bioproc

### Map GO ID to Descriptive Name

In [None]:
# Look up the descriptive name of every GO term from the ontology graph.
# A KeyError here means a GO ID is missing from `digraph`, which the depth filter
# above should already have ruled out.
bioproc['GO Biological Process'] = bioproc['GO ID'].map(
    lambda go_id: str(digraph.nodes[go_id]['name'])
)
bioproc

### Map Gene Symbols to Up-to-date Approved Gene Symbols

In [None]:
# Mapping table indexed by (column 0, column 1); column 2 holds the replacement symbol.
# The lookups below use column 0 as the taxon id and column 1 as the input symbol.
genesymbols = (
    pd.read_csv('tables/mappingFile_2023.tsv', sep='\t', header=None, index_col=False)
    .dropna()
    .set_index([0, 1])
    .sort_index(level=[0, 1])
)
taxon_id = 9606

# Build the symbol -> approved symbol dictionary for this taxon once, instead of doing a
# MultiIndex lookup for every annotation row.
approved_symbols = {
    symbol: approved
    for (taxon, symbol), approved in genesymbols[2].items()
    if taxon == taxon_id
}

# Symbols absent from the mapping become NaN and are dropped below.
bioproc['DB Object Symbol'] = bioproc['DB Object Symbol'].map(approved_symbols)

# Different input symbols can collapse onto one approved symbol, so de-duplicate afterwards.
bioproc = bioproc.dropna(subset=['DB Object Symbol']).drop_duplicates().reset_index(drop=True)
bioproc.columns = ['Gene Symbol', 'GO ID', 'GO Biological Process']
bioproc

## Prepare Data For SQL Ingestion

### Dataset

In [None]:
# Dataset row for SQL ingestion. The tuple carries the primary key first, followed by:
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, is_signed, is_continuous, last_updated, directory, num_page_views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural)
(137, 'GO Biological Process Annotations 2023', 'Biological Process Annotations 2023', 'curated annotations of genes with biological processes', 'gene-biological process associations from curated gene annotations', 'genes participating in the {0} biological process from the curated GO Biological Process Annotations 2023 dataset.', 'sets of genes participating in biological processes from the curated GO Biological Process Annotations 2023 dataset.', 'biological processes involving {0} gene from the curated GO Biological Process Annotations 2023 dataset.', 0, 0, '2023-05-12', 'gobp23', 0, 24, 4, 6, 17, 4, 'association by literature curation', 'curated literature', 'low throughput, hypothesis driven', 'biological processes')

### Publication

In [None]:
# Publication rows for SQL ingestion. Each tuple carries the primary key first, followed by:
#(id, long_citation, short_citation, url, pmid, pubmed_url, first_author_last_name, first_author_initials, journal_abbreviation, year, title, volume, pages)
# NOTE: a notebook only echoes the last expression of a cell, so only the second tuple is displayed.
(140, 'Gene, Ontology Consortium et al. (2017)  Expansion of the Gene Ontology knowledgebase and resources. Nucleic Acids Res. 45:D331-8', 'Gene, Nucleic Acids Res, 2017.', 'dx.doi.org/10.1093/nar/gkw1108', 27899567, 'http://www.ncbi.nlm.nih.gov/pubmed/27899567', 'Gene', 'Ontology Consortium', 'Nucleic Acids Res', 2017, 'Expansion of the Gene Ontology knowledgebase and resources', 45, 'D331-8')
(141, 'Gene, Ontology Consortium et al. (2019) The Gene Ontology Resource: 20 years and still GOing strong. Nucleic Acids Res. 47:D330-8', 'Gene, Nucleic Acids Res, 2019', 'dx.doi.org/10.1093/nar/gky1055', 30395331, 'http://www.ncbi.nlm.nih.gov/pubmed/30395331', 'Gene', 'Ontology Consortium', 'Nucleic Acids Res', 2019, 'The Gene Ontology Resource: 20 years and still GOing strong', 47, 'D330-8')

### Gene

In [None]:
# Genes already in the production database and the Entrez IDs they cover.
genes = pd.read_csv('production/gene.csv')
genelist = genes['ncbi_entrez_gene_id'].to_list()

# Approved symbol -> Entrez Gene ID; when a symbol repeats, its first row is used.
symbol_col = 'Human, Mouse, and Rat Approved Symbol'
entrez_col = 'Entrez Gene ID(supplied by NCBI)'
symbol_table = pd.read_csv('tables/GeneSymbolsAndIDS_2023.tsv', sep='\t').drop_duplicates(symbol_col)
geneids = symbol_table.set_index(symbol_col)[entrez_col].to_dict()

# Headerless table of newly added genes. Based on how it is used below: column 0 is the
# database key, column 1 the symbol and column 2 the Entrez ID.
newgenes = pd.read_csv('tables/newgenes.csv', index_col=0, header=None)
newgenelist = newgenes[2].to_list()
geneids.update(newgenes.set_index(1)[2].to_dict())

# Entrez ID -> gene description.
gene_info = pd.read_csv('tables/gene_info', sep='\t')
genedescs = gene_info.set_index('GeneID')['description'].to_dict()

# Entrez ID -> database key, from production first and then from the new-gene table.
genefks = genes.set_index('ncbi_entrez_gene_id')['id'].to_dict()
genefks.update(newgenes.reset_index().set_index(2)[0].to_dict())

In [None]:
# Print SQL value tuples for genes in this dataset that are not in the database yet, and
# record the key assigned to each so the association table can reference it.
index = 57231
geneurl = 'https://ncbi.nlm.nih.gov/gene/'

# Set membership is O(1); the original scanned two Python lists for every gene.
known_gene_ids = set(genelist) | set(newgenelist)

for gene in bioproc['Gene Symbol'].unique():
    # Normalise to a plain int so the printed tuple and URL never show a float ('7157.0')
    # or a numpy scalar repr. Named gene_id to avoid shadowing the builtin id().
    gene_id = int(geneids[gene])
    if gene_id not in known_gene_ids:
        print((index, gene, gene_id, genedescs[gene_id], geneurl+str(gene_id)), end=',\n')
        genefks[gene_id] = index
        # Remember the ID so a second symbol resolving to it does not get another key.
        known_gene_ids.add(gene_id)
        index += 1

### Attribute

In [None]:
# Existing attribute rows, restricted to naming authority 9 (the value written for the
# GO attributes generated in the next cell).
all_attributes = pd.read_csv('production/attribute.csv')
attributes = all_attributes[all_attributes['naming_authority_fk'] == 9]

# Identifiers already present, and identifier -> database key.
authority_ids = attributes['id_from_naming_authority']
attributeids = authority_ids.to_list()
attributefks = attributes.set_index('id_from_naming_authority')['id'].to_dict()

In [None]:
# Print SQL value tuples for GO terms that are not yet in the attribute table.
#(id, name_from_naming_authority, id_from_naming_authority, url_from_naming_authority, naming_authority_fk)
index = 320901
attributeurl = 'http://purl.obolibrary.org/obo/'

# Set membership is O(1); the original scanned a Python list for every term.
known_attribute_ids = set(attributeids)

for process in bioproc['GO ID'].unique():
    # Attribute identifiers use 'GO_0000000' rather than 'GO:0000000'.
    # Named attribute_id to avoid shadowing the builtin id().
    attribute_id = process.replace('GO:', 'GO_')
    if attribute_id not in known_attribute_ids:
        print((index, digraph.nodes[process]['name'], attribute_id, attributeurl+attribute_id, 9), end=',\n')
        attributefks[attribute_id] = index
        index += 1

### Gene Set

In [None]:
# Print one SQL value tuple per GO term (gene set) and remember the key assigned to it.
#(id, name_from_dataset, id_from_dataset, url_from_dataset, dataset_fk, attribute_type, attribute_fk)
index = 133800000
geneseturl = 'http://amigo.geneontology.org/amigo/term/'
genesetfks = {}

for process in bioproc['GO ID'].unique():
    # Gene-set identifiers use 'GO_0000000'; the URL keeps the original 'GO:0000000' form.
    # Named gene_set_id to avoid shadowing the builtin id().
    gene_set_id = process.replace('GO:', 'GO_')
    print((index, digraph.nodes[process]['name'], gene_set_id, geneseturl+process, 137, 17, attributefks[gene_set_id]), end=',\n')
    genesetfks[gene_set_id] = index
    index += 1

### Association

In [None]:
index = 17000000

# Translate every (gene symbol, GO ID) pair into database foreign keys:
# symbol -> Entrez ID -> gene key, and GO ID -> gene-set key.
associations = pd.DataFrame({
    'gene_fk': bioproc['Gene Symbol'].map(lambda symbol: genefks[geneids[symbol]]),
    'gene_set_fk': bioproc['GO ID'].map(lambda go_id: genesetfks[go_id.replace(':', '_')]),
})
associations['threshold_value'] = 1

# Shift the row labels so they start at the first free association id, and name the axis 'id'.
associations.index = associations.index + index
associations = associations.rename_axis('id')
associations

In [None]:
# Write the association rows to CSV; the frame's index is written as the first column.
associations.to_csv('harmonizome-update/gobioproc23.csv')

## Create Downloads

In [None]:
output_path = 'newdata/GeneOntology/bioproc23/downloads/'
# Create the download tree up front: later cells write into output_path itself,
# into 'kg_serializations/' and into 'kg_serializations/gobp23_tsv/', and none of the
# writers create missing directories.
os.makedirs(output_path + 'kg_serializations/gobp23_tsv', exist_ok=True)

# Attach the Entrez Gene ID of every approved symbol.
bioproc['Gene ID'] = bioproc['Gene Symbol'].map(lambda symbol: geneids[symbol])

In [None]:
# Display the annotation table, now including the 'Gene ID' column.
bioproc

### Binary Matrix

In [None]:
# Gene (rows) by biological process (columns) occurrence matrix, plus its transpose
# for the attribute-centric outputs further down.
binarymatrix = pd.crosstab(index=bioproc['Gene Symbol'], columns=bioproc['GO Biological Process'])
binarymatrixT = binarymatrix.transpose()
binarymatrix.to_csv(f'{output_path}gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
binarymatrix

### Gene-Attribute Edge List

In [None]:
# One row per gene-process association, with both the labels and the identifiers.
edge_columns = ['Gene Symbol', 'Gene ID', 'GO Biological Process', 'GO ID']
edgelist = bioproc[edge_columns]
edgelist.to_csv(f'{output_path}gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Distinct (symbol, Entrez ID) pairs present in the dataset.
gene_pairs = bioproc[['Gene Symbol', 'Gene ID']]
geneslist = gene_pairs.drop_duplicates().reset_index(drop=True)
geneslist.to_csv(f'{output_path}gene_list_terms.txt.gz', sep='\t', compression='gzip')
geneslist

### Attribute List

In [None]:
# Distinct (process name, GO ID) pairs present in the dataset.
process_pairs = bioproc[['GO Biological Process', 'GO ID']]
attributeslist = process_pairs.drop_duplicates().reset_index(drop=True)
attributeslist.to_csv(f'{output_path}attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributeslist

### Gene Set Library

In [None]:
# Process name -> GO ID, used to label each gene set as 'name (GO:xxxxxxx)'.
bioprocids = bioproc.set_index('GO Biological Process')['GO ID'].to_dict()

# Write one tab-separated line per biological process: the label followed by its genes.
# Processes with fewer than 5 genes are left out of the library.
# Columns are iterated directly, so the membership mask is computed once per process and
# no notebook-level names (such as `attributes`) are overwritten.
with open(output_path+'gene_set_library_crisp.gmt', 'w') as f:
    for process_name, membership in binarymatrix.items():
        member_genes = membership.index[membership.to_numpy(dtype=np.int_) == 1]
        if len(member_genes) >= 5:
            print(process_name+' ('+bioprocids[process_name]+')', *member_genes, sep='\t', end='\n', file=f)

### Attribute Set Library

In [None]:
# Write one tab-separated line per gene: the gene symbol followed by the biological
# processes it is annotated with. Genes with fewer than 5 processes are left out.
# Columns are iterated directly, so the membership mask is computed once per gene and
# no notebook-level names (such as `genes`) are overwritten.
with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    for gene_symbol, membership in binarymatrixT.items():
        member_processes = membership.index[membership.to_numpy(dtype=np.int_) == 1]
        if len(member_processes) >= 5:
            print(gene_symbol, *member_processes, sep='\t', end='\n', file=f)

### Gene-Gene Similarity Matrix

In [None]:
# Pairwise cosine distance between gene rows, expanded to a square matrix and converted
# to similarity (1 - distance).
gene_distances = dist.pdist(binarymatrix.to_numpy(dtype=np.int_), 'cosine')

# Index.rename returns a new, unnamed Index. Clearing the name on the frame's own index
# would instead mutate the Index object that can be shared with `binarymatrix`.
gene_labels = binarymatrix.index.rename(None)

gene_similarity_matrix = pd.DataFrame(
    data=1 - dist.squareform(gene_distances),
    index=gene_labels,
    columns=gene_labels,
)
gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute-Attribute Similarity Matrix

In [None]:
# Pairwise cosine distance between biological-process rows, expanded to a square matrix
# and converted to similarity (1 - distance).
attribute_distances = dist.pdist(binarymatrixT.to_numpy(dtype=np.int_), 'cosine')

# Index.rename returns a new, unnamed Index. Clearing the name on the frame's own index
# would instead mutate the Index object that can be shared with `binarymatrixT`.
attribute_labels = binarymatrixT.index.rename(None)

attribute_similarity_matrix = pd.DataFrame(
    data=1 - dist.squareform(attribute_distances),
    index=attribute_labels,
    columns=attribute_labels,
)
attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Knowledge Graph Serialization

In [None]:
# Build the knowledge-graph payload: `nodes` maps a node key to its type and properties,
# `edges` lists one 'participates in' relation per gene-process association.
nodes = {}
edges = []

# Gene nodes, keyed by Entrez ID. The ID is converted to a plain int once and reused:
# numpy integers are rejected by json.dump (as dict keys and as values) and float IDs
# would be written as e.g. '7157.0', disagreeing with the integer stored in properties.
for raw_gene_id, gene_symbol in zip(geneslist['Gene ID'], geneslist['Gene Symbol']):
    gene_id = int(raw_gene_id)
    nodes[gene_id] = {
        "type":"gene",
        "properties": {
            "id":gene_id,
            "label":gene_symbol
        }}

# Biological-process nodes, keyed by GO ID.
for go_id, process_name in zip(attributeslist['GO ID'], attributeslist['GO Biological Process']):
    nodes[go_id] = {
        "type":"biological process",
        "properties": {
            "id":go_id,
            "label":process_name
        }}

# Edges. Columns are zipped instead of fetching each row with .loc, which builds a new
# Series per association.
for raw_gene_id, gene_symbol, process_name, go_id in zip(
    edgelist['Gene ID'],
    edgelist['Gene Symbol'],
    edgelist['GO Biological Process'],
    edgelist['GO ID'],
):
    gene_id = int(raw_gene_id)
    edges.append({
        "source": gene_id,
        "relation": "participates in",
        "target": go_id,
        "properties":{
            "id":str(gene_id)+":"+go_id,
            "source_id":gene_id,
            "source_label":gene_symbol,
            "target_label":process_name,
            "target_id":go_id,
            "directed":True,
            "threshold":1
        }})

#### RDF

In [None]:
# Header: three prefix declarations followed by an empty line.
# NOTE(review): these declarations have no <...> around the IRI and no trailing '.', so the
# file may not parse as Turtle -- confirm what consumers of this download expect.
rdf_header = [
    '@prefix gene: ncbi.nlm.nih.gov/gene/',
    '@prefix RO: purl.obolibrary.org/RO_',
    '@prefix GO: amigo.geneontology.org/amigo/term/GO:',
    '',
]

with open(output_path+'kg_serializations/gobp23.rdf', 'w') as f:
    f.write('\n'.join(rdf_header) + '\n')
    # One statement per edge: gene:<Entrez ID> RO:0000056 <GO ID> .
    for edge in edges:
        props = edge['properties']
        f.write(f"gene:{props['source_id']} RO:0000056 {props['target_id']} .\n")

#### JSON

In [None]:
# Serialize the graph to JSON. json.dump writes to the file and returns None, so its
# result is not stored.
with open(output_path+'kg_serializations/gobp23.json', 'w') as f:
    json.dump(
        {
            "Version":"1", 
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

#### TSV

In [None]:
# Flatten the node dictionary into a (namespace, id, label) table, one row per node.
namespace_by_type = {'gene':'NCBI Entrez', 'biological process':'Gene Ontology'}

node_table = pd.DataFrame(nodes).T
nodeframe = pd.DataFrame({
    'namespace': node_table['type'].map(lambda node_type: namespace_by_type[node_type]),
    'id': node_table['properties'].map(lambda props: props['id']),
    'label': node_table['properties'].map(lambda props: props['label']),
}).reset_index(drop=True)
nodeframe.to_csv(f'{output_path}kg_serializations/gobp23_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Flatten the edge list into a (source, relation, target, threshold) table.
edge_table = pd.DataFrame(edges)
edgeframe = edge_table[['source', 'relation', 'target']].assign(
    threshold=edge_table['properties'].map(lambda props: props['threshold'])
)
edgeframe.to_csv(f'{output_path}kg_serializations/gobp23_tsv/edges.tsv', sep='\t')
edgeframe

## Create Visualizations

### Gene-Attribute Clustermap

In [None]:
# Reload the gene-by-process matrix from the saved download, using the 'Gene Symbol'
# column as the row index (presumably so the plots can be made without recomputing it).
binarymatrix = pd.read_csv('newdata/GeneOntology/bioproc23/downloads/gene_attribute_matrix.txt.gz', sep='\t', compression='gzip', index_col='Gene Symbol')

In [None]:
# Hierarchically clustered heatmap of the gene-by-process matrix.
# The notebook imports the package as `seaborn`; no `sns` alias is defined.
seaborn.clustermap(binarymatrix, cmap='seismic', center=0, figsize=(50,50))

### Gene-Gene Similarity Clustermap

In [None]:
# Reload the gene-gene similarity matrix. Its first column was saved without a header,
# so pandas names it 'Unnamed: 0'; that column is used as the row index.
gene_similarity_matrix = pd.read_csv('newdata/GeneOntology/bioproc23/downloads/gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip', index_col='Unnamed: 0')
gene_similarity_matrix

In [None]:
# Hierarchically clustered heatmap of gene-gene cosine similarity.
# The notebook imports the package as `seaborn`; no `sns` alias is defined.
seaborn.clustermap(gene_similarity_matrix, cmap='seismic', center=0)

### Attribute-Attribute Similarity Clustermap

In [None]:
# Reload the process-process similarity matrix. Its first column was saved without a
# header, so pandas names it 'Unnamed: 0'; that column is used as the row index.
attribute_similarity_matrix = pd.read_csv('newdata/GeneOntology/bioproc23/downloads/attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip', index_col='Unnamed: 0')
attribute_similarity_matrix

In [None]:
# Hierarchically clustered heatmap of process-process cosine similarity.
# The notebook imports the package as `seaborn`; no `sns` alias is defined.
seaborn.clustermap(attribute_similarity_matrix, cmap='seismic', center=0)

### UMAP

In [None]:
def load_gmt(file):
    """Parse a GMT gene-set library into an ordered ``term -> members`` mapping.

    Parameters
    ----------
    file : iterable of str
        Lines of a GMT file (for example an open text file). On each line the first
        tab-separated field is the term and the remaining fields are its members.

    Returns
    -------
    collections.OrderedDict
        Maps each term, in file order, to its members joined by single spaces.
        Repeated members are kept once, in order of first appearance. If a term
        occurs on several lines, the last line wins.
    """
    gmt = OrderedDict()
    for line in file:
        line = line.strip()
        if not line:
            # A blank line would otherwise register a gene set with an empty name.
            continue
        term, *geneset = line.split('\t')
        # dict.fromkeys removes duplicates like set() but keeps first-seen order, so the
        # joined string is identical from run to run (iteration order of a set of
        # strings depends on the interpreter's hash seed).
        gmt[term] = ' '.join(dict.fromkeys(geneset))
    return gmt

In [None]:
# Read the gene set library written earlier. The context manager closes the file handle
# once parsing is done, instead of leaving it open for the rest of the session.
with open('newdata/GeneOntology/bioproc23/downloads/gene_set_library_crisp.gmt', 'r') as gmt_file:
    libdict = load_gmt(gmt_file)
scatterdir = 'newdata/GeneOntology/bioproc23/images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed a gene set library in 2-D with UMAP and label each set with a Leiden cluster.

    Parameters
    ----------
    libdict : mapping
        Term -> text of the gene set; the values are passed to ``TfidfVectorizer`` and
        the keys become the observation names.
    nneighbors : int
        Passed to ``sc.pp.neighbors`` as ``n_neighbors``.
    mindist, spread
        Passed to ``sc.tl.umap`` as ``min_dist`` and ``spread``.
    maxdf, mindf
        Passed to ``TfidfVectorizer`` as ``max_df`` and ``min_df``.

    Returns
    -------
    pandas.DataFrame
        One row per term with columns ``x``, ``y`` (UMAP coordinates), ``cluster``
        ('Cluster <label>'), ``term`` and ``genes`` (the term's entry in ``libdict``).
        Rows are ordered by Leiden label.
    """
    print("\tTF-IDF vectorizing gene set data...")
    vec = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    X = vec.fit_transform(libdict.values())
    print(X.shape)
    # Wrap the TF-IDF matrix so the scanpy routines below can operate on it.
    adata = anndata.AnnData(X)
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    ### the n_neighbors and min_dist parameters can be altered
    sc.pp.neighbors(adata, n_neighbors=nneighbors)
    sc.tl.leiden(adata, resolution=1.0)
    # The UMAP seed is fixed at 42.
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder the observations so that terms with the same Leiden label are adjacent.
    new_order = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[new_order, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    df = pd.DataFrame(adata.obsm['X_umap'])
    df.columns = ['x', 'y']

    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    df['genes'] = [libdict[l] for l in df['term']]

    return df

In [None]:
def get_scatter_colors(df):
    """Assign a Category20 colour to every distinct value of ``df['cluster']``.

    Clusters are taken in order of first appearance. The palette is arranged with the
    even-indexed Category20 colours first and the odd-indexed ones after, and is reused
    cyclically once more than 20 clusters are present.

    Returns a dict mapping cluster label -> colour.
    """
    palette = list(Category20[20])
    colors = palette[::2] + palette[1::2]
    clusters = pd.unique(df['cluster']).tolist()
    color_mapper = {}
    for position, cluster in enumerate(clusters):
        color_mapper[cluster] = colors[position % 20]
    return color_mapper

def get_scatterplot(scatterdf, title='Gene Sets in Gene Ontology Biological Process Annotations 2023 Library'):
    """Build an interactive Bokeh scatter plot of the UMAP embedding.

    Parameters
    ----------
    scatterdf : pandas.DataFrame
        Needs the columns ``x``, ``y``, ``term`` and ``cluster``. It is copied, so the
        caller's frame is not modified.
    title : str, optional
        Plot title. Defaults to the title used for this dataset.

    Returns
    -------
    bokeh figure
        1000x700 SVG-backed figure with one point per row, coloured by cluster, and a
        hover tooltip showing the gene set name, its coordinates and its cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Tooltip template: every <div> row is closed, so the markup is well-formed.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    # Column names here are the ones referenced by the tooltip (@gene_set, @x, @y,
    # @cluster) and by the scatter glyph ('x', 'y', 'colors').
    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide axis ticks and tick labels
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = title
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    # The glyph renderer returned by scatter() is not needed afterwards.
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
## defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
# Only nneighbors and mindist are overridden here. Other values that can be tried by
# adding them to the dict: spread=0.8, maxdf=0.5, mindf=10
umap_params = dict(nneighbors=15, mindist=0.1)
scatter_df = process_scatterplot(libdict, **umap_params)

# Display Scatter Plot
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# save HTML if desired
# scatterdir already ends with '/', so strip it before joining to avoid writing to
# '.../images//umap.html'; this also works if the trailing slash is ever removed.
output_file(filename=f"{scatterdir.rstrip('/')}/umap.html", title = 'Gene Sets in Gene Ontology Biological Process Annotations 2023 Library')
save(plot)