# SynGO Harmonizome Processing Notebook
This notebook contains the processing scripts used to prepare the SynGO dataset for Harmonizome. The SynGO 1.2 release data [SynGO_bulk_download_release_20231201.zip](https://www.syngoportal.org/data/SynGO_bulk_download_release_20231201.zip) was downloaded from the [SynGO Portal](https://www.syngoportal.org/).

In [None]:
import pandas as pd
import datetime
import numpy as np
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm
from collections import OrderedDict

from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

## Load and Pre-process Data

In [None]:
# Read the annotation spreadsheet from the unpacked SynGO bulk download.
syngo = pd.read_excel('SynGO_bulk_download_release_20231201/syngo_annotations.xlsx')
syngo

In [None]:
# Keep only the columns used downstream. Bracket selection raises a KeyError
# when a column is missing (DataFrame.get would silently return None), and the
# explicit copy lets later cells assign columns on an independent frame.
syngo = syngo[['hgnc_symbol', 'go_name', 'go_id', 'go_domain']].copy()
syngo

## Map Genes to Approved and Up-To-Date Symbols

In [None]:
# Build a lookup from column 1 to column 2 of the headerless, tab-separated
# mapping file, with both sides converted to upper-case strings.
mapping = pd.read_csv('../../mapping/mappingFile_2023.tsv', sep='\t', header=None)
for column in (1, 2):
    mapping[column] = mapping[column].map(lambda value: str(value).upper())
mapping = mapping.set_index(1)[2].to_dict()
mapping

In [None]:
# A row may list several symbols separated by ';' — give each its own row.
syngo['hgnc_symbol'] = syngo['hgnc_symbol'].apply(str.split, sep=';')
syngo = syngo.explode('hgnc_symbol')

# The keys of `mapping` are upper-cased, so normalise the symbols the same way
# (and trim whitespace left over from the split) before the lookup; otherwise
# a symbol containing lower-case letters can never match.
syngo['hgnc_symbol'] = syngo['hgnc_symbol'].str.strip().str.upper().map(mapping)

# Symbols absent from the mapping come back as NaN. Report and drop them here
# rather than letting NaN reach the foreign-key lookups further down.
n_unmapped = int(syngo['hgnc_symbol'].isna().sum())
if n_unmapped:
    print(f'Dropping {n_unmapped} annotations whose gene symbol could not be mapped')
    syngo = syngo.dropna(subset=['hgnc_symbol'])
syngo

In [None]:
# Keep one copy of each identical row and renumber the index from 0.
syngo = syngo.drop_duplicates().reset_index(drop=True)
syngo

In [None]:
# Summary of the cleaned table: distinct genes, distinct terms, total rows.
n_genes = syngo['hgnc_symbol'].nunique(dropna=False)
n_terms = syngo['go_name'].nunique(dropna=False)
print(f'{n_genes} genes, {n_terms} terms, {syngo.shape[0]} annotations')

### Enrichr GMT Creation

In [None]:
# Binary gene x term matrix: 1 where a gene is annotated with a term, else 0.
# The aggregation is named ('max') because handing the builtin `max` to pandas
# aggregations is deprecated.
binaryMatrix = pd.crosstab(syngo['hgnc_symbol'], syngo['go_name'], values=1, aggfunc='max').replace(np.nan, 0)
arr = binaryMatrix.reset_index(drop=True).to_numpy(dtype=np.int_)
attributes = binaryMatrix.columns
domains = syngo.set_index('go_name')['go_domain'].to_dict()

# Open (and truncate) the output only after the matrix has been built, so a
# failure above does not leave an empty library file behind.
with open('downloads/SynGO_2024.txt', 'w') as f:
    for i, attribute in enumerate(tqdm(attributes)):
        members = binaryMatrix.index[arr[:, i] == 1]
        # Terms with fewer than 5 genes are left out of the library.
        if len(members) >= 5:
            term = f'{attribute} {domains[attribute]}'
            # Row layout: term label, empty second field, then the gene symbols.
            print(term, '', *members, sep='\t', end='\n', file=f)

## Process Data for SQL

### Resource

In [None]:
# Row literal for the "Resource" SQL section; values follow the column order
# given in the comment below.
# NOTE(review): the id is the string '112' here while the other row literals in
# this notebook use integer ids — confirm which form is intended.
#(id, name, long_description, short_description, url, num_attributes, num_datasets)
('112', 'SynGO', 'a public knowledgebase for synapse research providing synapse ontologies, synaptic gene/protein annotations, and online analysis and visualization tools', 'An evidence-based, expert-curated resource for synapse function and gene enrichment studies', 'https://www.syngoportal.org/', 267, 1)

### Dataset

In [None]:
# Row literal for the "Dataset" SQL section; values follow the column order
# given in the comment below.
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, is_signed, is_continuous_valued, last_updated, directory, num_page views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural, is_archived)
(154, 'SynGO Synaptic Gene Annotations', 'Synaptic Gene Annotations', 'curated annotations of genes with synaptic terms', 'gene-biological terms associations from curated gene annotations', 'genes associated with the synaptic term {0} from the SynGO Synaptic Gene Annotations dataset.', 'sets of genes associated with synaptic terms from the SynGO Synaptic Gene Annotations dataset.', 'synaptic terms associated with {0} gene from the SynGO Synaptic Gene Annotations dataset.', 0, 0, '2024-03-04', 'syngo', 0, 112, 4, 6, 18, 4, 'association by literature curation', 'curated literature', 'low throughput, hypothesis driven', 'biological terms', 0)

### Publication

In [None]:
# Row literal for the "Publication" SQL section; values follow the column
# order given in the comment below.
#(id, long_citation, short_citation, url, pmid, pubmed_url, first_author_last_name, first_author_initials, journal_abbreviateion, year, title, volume, pages)
(153, 'Koopmans, F et al. (2019) SynGO: An Evidence-Based, Expert-Curated Knowledge Base for the Synapse. Neuron. 103:217-34', 'Koopmans, Neuron, 2019', 'dx.doi.org/10.1016/j.neuron.2019.05.002', 31171447, 'https://pubmed.ncbi.nlm.nih.gov/31171447/', 'Koopmans', 'F', 'Neuron', 2019, 'SynGO: An Evidence-Based, Expert-Curated Knowledge Base for the Synapse', 103, '217-34')
# NOTE(review): no column list is given for the next row. Its last two values
# equal the dataset id (154) and the publication id (153) used in this
# notebook — presumably a dataset/publication link row; confirm.
(235, 154, 153)

### Naming Authority

In [None]:
# Row literal for the "Naming Authority" SQL section; values follow the column
# order given in the comment below (publication_fk 153 is the publication id
# used in this notebook).
#(id, name, description, url, publication_fk)
(106, 'SynGO', 'An evidence-based, expert-curated resource for synapse function and gene enrichment studies', 'https://www.syngoportal.org/', 153)

### Attributes

In [None]:
# Existing attribute table; the following cells read its naming_authority_fk,
# name_from_naming_authority and id columns.
attributes = pd.read_csv('../../tables/attribute.csv')
attributes

In [None]:
# Store term IDs with '_' in place of ':'.
syngo['go_id'] = syngo['go_id'].map(lambda go_id: go_id.replace(':', '_'))

# Names of the attributes already registered under naming authority 9.
is_authority_9 = attributes['naming_authority_fk'] == 9
terms = attributes.loc[is_authority_9, 'name_from_naming_authority'].tolist()

In [None]:
# Lookup from attribute name to attribute id, limited to naming authority 9.
authority_9_rows = attributes[attributes['naming_authority_fk'] == 9]
attributefks = authority_9_rows.set_index('name_from_naming_authority')['id'].to_dict()
attributefks

In [None]:
# Print one attribute row per term whose name is not already in `terms`,
# numbering the new rows from 394173 and recording each new id in
# `attributefks`.
index = 394173
namingauth = {'GO':9, 'SYNGO':106}
synapseterms = syngo[['go_name', 'go_id']].drop_duplicates()
for _, row in synapseterms.iterrows():
    # The attribute name is the term name up to the first ' ('.
    attname = row['go_name'].split(' (')[0]
    if attname in terms:
        continue
    attid = row['go_id']
    # The ID prefix before '_' selects the naming authority.
    print((index, attname, attid, namingauth[attid.split('_')[0]]), end=',\n')
    attributefks[attname] = index
    index += 1

### Gene Sets

In [None]:
# Print one gene-set row per term, numbering from 135500000, and remember the
# id assigned to each term name in `genesetfks`.
index = 135500000
genesetfks = {}
for go_name, attid in zip(synapseterms['go_name'], synapseterms['go_id']):
    attname = go_name.split(' (')[0]
    print((index, attname, attid, 154, 18, attributefks[attname], 0), end=',\n')
    genesetfks[attname] = index
    index += 1

### Associations

In [None]:
# Gene table indexed by symbol, and a symbol -> gene id lookup.
genes = pd.read_csv('../../tables/gene.csv', index_col='symbol')
genefks = genes['id'].to_dict()

# One association row per annotation: gene id, gene-set id and a constant
# threshold of 1. A symbol or term without an id raises KeyError.
associations = pd.DataFrame({
    'gene_fk': syngo['hgnc_symbol'].map(lambda symbol: genefks[symbol]),
    'gene_set_fk': syngo['go_name'].map(lambda name: genesetfks[name.split(' (')[0]]),
})
associations['threshold_value'] = 1

# Row ids are the row positions shifted by 41000000, written as column 'id'.
associations.index = (associations.index + 41000000).rename('id')
associations.to_csv('../../harmonizome-update/syngo.csv')
associations

## Create Downloads

In [None]:
# Directory prefix for every download file written below.
output_path = 'downloads/'

# Switch to the column names used in the download files, shorten each term
# name to the part before the first ' (' and add a constant threshold column.
syngo.columns = ['Gene', 'Term', 'Term ID', 'Term Domain']
syngo['Term'] = [name.split(' (')[0] for name in syngo['Term']]
syngo['Threshold'] = 1
syngo

### Gene-Attribute Matrix

In [None]:
# Gene x term matrix holding the largest threshold per pair, 0 where a gene
# has no annotation for a term. The aggregation is named ('max') because
# handing the builtin `max` to pandas aggregations is deprecated.
binaryMatrix = pd.crosstab(syngo['Gene'], syngo['Term'], values=syngo['Threshold'], aggfunc='max').replace(np.nan, 0)
# Transposed (term x gene) view, used for the attribute-centred outputs.
binaryMatrixT = binaryMatrix.T
binaryMatrix.to_csv(f'{output_path}gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
binaryMatrix

### Gene-Attribute Edge List

In [None]:
# Symbol -> ncbi_entrez_gene_id lookup taken from the gene table.
geneids = genes['ncbi_entrez_gene_id'].to_dict()

# Annotation table with a 'Gene ID' column added (NaN when a symbol has no
# entry in the lookup), reduced to the edge-list columns.
edgelist = syngo.assign(**{'Gene ID': syngo['Gene'].map(geneids)})
edgelist = edgelist[['Gene', 'Gene ID', 'Term', 'Term ID', 'Threshold']]
edgelist

### Gene List

In [None]:
# Distinct (Gene, Gene ID) pairs in first-seen order, written as gzipped TSV.
gene_columns = ['Gene', 'Gene ID']
genelist = edgelist.drop_duplicates(subset=gene_columns)[gene_columns].reset_index(drop=True)
genelist.to_csv(f'{output_path}gene_list_terms.txt.gz', sep='\t', compression='gzip')
genelist

### Attribute List

In [None]:
# Distinct (Term, Term ID) pairs in first-seen order, written as gzipped TSV.
term_columns = ['Term', 'Term ID']
attributelist = edgelist.drop_duplicates(subset=term_columns)[term_columns].reset_index(drop=True)
attributelist.to_csv(f'{output_path}attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributelist

### Gene Set Library

In [None]:
# Term name -> term ID and term name -> domain lookups. `domains` is also read
# by the knowledge-graph serialization cell further down.
termids = syngo.set_index('Term')['Term ID'].to_dict()
domains = syngo.set_index('Term')['Term Domain'].to_dict()
arr = binaryMatrix.reset_index(drop=True).to_numpy(dtype=np.int_)

with open(f'{output_path}gene_set_library_crisp.gmt', 'w') as f:
    # Iterate the columns directly instead of rebinding the notebook-level
    # `attributes` name, which holds the table read from attribute.csv.
    for i, term_name in enumerate(tqdm(binaryMatrix.columns)):
        members = binaryMatrix.index[arr[:, i] == 1]
        # Terms with fewer than 5 genes are left out of the library.
        if len(members) >= 5:
            # Row layout: term name, term ID, then the gene symbols.
            print(term_name, termids[term_name], *members, sep='\t', end='\n', file=f)

### Attribute Set Library

In [None]:
arr = binaryMatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)

with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    # Iterate the columns directly instead of rebinding the notebook-level
    # `genes` name: it holds the gene table that the edge-list cell reads, so
    # overwriting it breaks that cell when it is re-run.
    for i, gene_symbol in enumerate(tqdm(binaryMatrixT.columns)):
        gene_terms = binaryMatrixT.index[arr[:, i] == 1]
        # Genes with fewer than 5 terms are left out of the library.
        if len(gene_terms) >= 5:
            # Row layout: gene symbol followed by its term names.
            print(gene_symbol, *gene_terms, sep='\t', end='\n', file=f)

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between gene rows: 1 minus the cosine distance,
# expanded from condensed to square form.
gene_cosine_distance = dist.squareform(
    dist.pdist(binaryMatrix.to_numpy(dtype=np.int_), 'cosine')
)
gene_similarity_matrix = pd.DataFrame(
    data=1 - gene_cosine_distance,
    index=binaryMatrix.index,
    columns=binaryMatrix.index,
)

# Clear the axis names before writing.
# NOTE(review): the frame may share its Index object with binaryMatrix, in
# which case this also clears the name there — confirm.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between term rows: 1 minus the cosine distance,
# expanded from condensed to square form.
term_cosine_distance = dist.squareform(
    dist.pdist(binaryMatrixT.to_numpy(dtype=np.int_), 'cosine')
)
attribute_similarity_matrix = pd.DataFrame(
    data=1 - term_cosine_distance,
    index=binaryMatrixT.index,
    columns=binaryMatrixT.index,
)

# Clear the axis names before writing.
# NOTE(review): the frame may share its Index object with binaryMatrixT, in
# which case this also clears the name there — confirm.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Knowledge Graph Serialization

In [None]:
# Knowledge-graph containers: `nodes` maps node id -> node record and `edges`
# is a list of gene -> term edge records.
nodes = {}
edges = []

# Node type and edge relation for each domain code.
node_types = {'BP': 'biological process', 'CC': 'cellular component'}
edge_relations = {'BP': 'participates in', 'CC': 'contained in'}

# Gene nodes, keyed by integer gene ID.
for symbol, gene_id in zip(genelist['Gene'], genelist['Gene ID']):
    gene_id = int(gene_id)
    nodes[gene_id] = {
        "type": "gene",
        "properties": {
            "id": gene_id,
            "label": symbol
        }}

# Term nodes, keyed by term ID; the type comes from the term's domain.
for term_name, term_id in zip(attributelist['Term'], attributelist['Term ID']):
    nodes[term_id] = {
        "type": node_types[domains[term_name]],
        "properties": {
            "label": term_name,
            "id": term_id
        }}

# One directed edge per gene-term annotation.
for symbol, gene_id, term_name, term_id, threshold in zip(
        edgelist['Gene'], edgelist['Gene ID'], edgelist['Term'],
        edgelist['Term ID'], edgelist['Threshold']):
    # Convert once so the edge id is built from the same integer as the
    # source id; str() of a float-typed gene ID would give e.g. '1234.0'.
    gene_id = int(gene_id)
    edges.append({
        "source": gene_id,
        "relation": edge_relations[domains[term_name]],
        "target": term_id,
        "properties": {
            "id": f'{gene_id}:{term_id}',
            "source_id": gene_id,
            "source_label": symbol,
            "target_id": term_id,
            "target_label": term_name,
            "directed": True,
            "threshold": int(threshold)
        }})

In [None]:
# Display the node dictionary for inspection.
nodes

In [None]:
# NOTE(review): `error` is not defined anywhere visible in this notebook, so
# evaluating it raises NameError — presumably a deliberate stop so that a
# "Run All" halts before the serialization files are written; confirm.
error

### RDF

In [None]:
# Write three prefix lines, a blank line, then one
# "gene:<id> <relation> <term> ." line per edge.
# NOTE(review): term IDs beginning with 'SYNGO' are written with a 'SYNGO:'
# prefix that has no matching @prefix line, and the prefix lines are not in
# the '@prefix name: <IRI> .' form — confirm what downstream readers expect.
with open(f'{output_path}kg_serializations/syngo.rdf', 'w') as f:
    for header_line in (
        '@prefix gene: ncbi.nlm.nih.gov/gene/',
        '@prefix RO: purl.obolibrary.org/RO_',
        '@prefix GO: amigo.geneontology.org/amigo/term/GO:',
        '',
    ):
        f.write(header_line + '\n')

    # The relation is chosen from the type of the edge's target node.
    relations = {'biological process':'RO:0000056', 'cellular component':'RO:0001018'}
    for edge in edges:
        properties = edge['properties']
        subject = 'gene:' + str(properties['source_id'])
        predicate = relations[nodes[edge['target']]['type']]
        # Only the first '_' of the term ID is turned back into ':'.
        target = properties['target_id'].replace('_', ':', 1)
        f.write(f'{subject} {predicate} {target} .\n')

### JSON

In [None]:
# Write the graph as indented JSON. json.dump writes to the file and returns
# None, so its result is not bound to a name.
with open(f'{output_path}kg_serializations/syngo.json', 'w') as f:
    json.dump(
        {
            "Version": "1",
            "nodes": nodes,
            "edges": edges
        },
        f,
        indent=4,
    )

### TSV

#### Nodes

In [None]:
# One row per node; copy 'id' and 'label' out of each node's properties dict
# into their own columns.
nodeframe = pd.DataFrame(nodes).T
for field in ('id', 'label'):
    nodeframe[field] = [properties[field] for properties in nodeframe['properties']]
def namespace(nodeid):
    """Return the namespace label for a node id.

    Only the text before the first '_' is inspected: it yields 'SynGO' when it
    contains 'SYNGO', 'GO' when it contains 'GO', and 'NCBI Entrez' otherwise.
    """
    prefix = str(nodeid).partition('_')[0]
    # 'SYNGO' is tested first because it also contains 'GO'.
    for marker, label in (('SYNGO', 'SynGO'), ('GO', 'GO')):
        if marker in prefix:
            return label
    return 'NCBI Entrez'
# Label every node with its namespace, keep the three output columns and
# write them as a TSV file.
nodeframe['namespace'] = nodeframe['id'].map(namespace)
nodeframe = nodeframe.get(['namespace', 'id', 'label'])
nodeframe = nodeframe.reset_index(drop=True)
nodeframe.to_csv(f'{output_path}kg_serializations/syngo_tsv/nodes.tsv', sep='\t')
nodeframe

#### Edges

In [None]:
# One row per edge; copy the threshold out of each edge's properties dict and
# keep the four output columns.
edgeframe = pd.DataFrame(edges)
edgeframe['threshold'] = [properties['threshold'] for properties in edgeframe['properties']]
edgeframe = edgeframe.get(['source', 'relation', 'target', 'threshold'])
edgeframe.to_csv(f'{output_path}kg_serializations/syngo_tsv/edges.tsv', sep='\t')
edgeframe

## Create Visualizations

### Gene-Attribute Clustered Heat Map

In [None]:
# Clustered heat map of the gene x term matrix, colour scale centred on 0.
sns.clustermap(binaryMatrix, cmap='seismic', center=0, figsize=(25,25))

### Gene Similarity Clustered Heat Map

In [None]:
# Clustered heat map of the gene-gene cosine similarities, colour scale centred on 0.
sns.clustermap(gene_similarity_matrix, cmap='seismic', center=0, figsize=(25,25))

### Attribute Similarity Clustered Heat Map

In [None]:
# Clustered heat map of the term-term cosine similarities, colour scale centred on 0.
sns.clustermap(attribute_similarity_matrix, cmap='seismic', center=0, figsize=(25,25))

### UMAP

In [None]:
def load_gmt(file):
    """Parse GMT lines into an ordered mapping of term -> gene string.

    Each non-blank line is tab-separated: the term, a second field that is
    ignored, then the gene symbols. Blank lines are skipped. The genes of a
    term are de-duplicated and joined with single spaces in first-seen order,
    so the result is the same on every run.

    Args:
        file: An iterable of text lines, e.g. an open file.

    Returns:
        An OrderedDict keyed by term, in input order.

    Raises:
        ValueError: If a non-blank line has fewer than two fields.
    """
    gmt = OrderedDict()
    for line in file:
        line = line.strip()
        if not line:
            continue
        term, _, *geneset = line.split('\t')
        # dict.fromkeys removes repeats while keeping the original order.
        gmt[term] = ' '.join(dict.fromkeys(geneset))
    return gmt

In [None]:
# Read the gene set library back in; the context manager closes the file once
# it has been parsed.
with open('downloads/gene_set_library_crisp.gmt', 'r') as gmt_file:
    libdict = load_gmt(gmt_file)
# Directory prefix for the saved plot.
scatterdir = 'images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed and cluster gene sets, returning one row per gene set.

    The gene strings in ``libdict`` are TF-IDF vectorized, a neighbour graph is
    built on that matrix, and Leiden clustering plus a UMAP layout are
    computed from it.

    Args:
        libdict: Mapping of term -> space-separated gene string.
        nneighbors: ``n_neighbors`` for the neighbour graph.
        mindist: ``min_dist`` for UMAP.
        spread: ``spread`` for UMAP.
        maxdf: ``max_df`` for the TF-IDF vectorizer.
        mindf: ``min_df`` for the TF-IDF vectorizer.

    Returns:
        DataFrame with columns x, y, cluster, term and genes, with rows
        ordered by Leiden cluster.
    """
    print("\tTF-IDF vectorizing gene set data...")
    tfidf = TfidfVectorizer(max_df=maxdf, min_df=mindf).fit_transform(libdict.values())
    print(tfidf.shape)
    adata = anndata.AnnData(tfidf)
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    # n_neighbors and min_dist are the parameters most worth tuning.
    sc.pp.neighbors(adata, n_neighbors=nneighbors, use_rep='X')
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder the observations by cluster, then prefix the cluster labels.
    by_cluster = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[by_cluster, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    embedding = pd.DataFrame(adata.obsm['X_umap'], columns=['x', 'y'])
    embedding['cluster'] = adata.obs['leiden'].values
    embedding['term'] = adata.obs.index
    embedding['genes'] = [libdict[term] for term in embedding['term']]

    return embedding

In [None]:
def get_scatter_colors(df):
    """Map each distinct value of ``df['cluster']`` to a Category20 colour.

    The 20-colour palette is reordered so that its even-indexed colours come
    before its odd-indexed ones; clusters take colours in order of first
    appearance, wrapping around after 20.
    """
    palette = list(Category20[20])
    ordered_palette = palette[::2] + palette[1::2]
    cluster_names = pd.unique(df['cluster']).tolist()
    return {
        cluster: ordered_palette[position % 20]
        for position, cluster in enumerate(cluster_names)
    }

def get_scatterplot(scatterdf):
    """Build the Bokeh scatter plot of the gene-set embedding.

    Args:
        scatterdf: DataFrame with columns x, y, term and cluster. It is copied,
            not modified.

    Returns:
        A Bokeh figure with one point per row, coloured by cluster, with a
        hover tooltip showing the gene set name, coordinates and cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Each tooltip row sits in its own <div>, and every <div> is closed.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000,
        height=700,
        tools=tools_emb
    )

    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # Hide the tick marks and shrink the tick labels to nothing.
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt'

    plot_emb.output_backend = "svg"

    plot_emb.title = 'Gene Sets in the SynGO Synaptic Gene Annotations Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'

    # The returned glyph renderer is not needed, so it is not kept.
    plot_emb.scatter(
        'x',
        'y',
        size = 4,
        source = source,
        color = 'colors'
    )

    return plot_emb

In [None]:
## defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
# Settings for this library; maxdf and mindf keep their defaults
# (previously tried: maxdf=1.0, mindf=36).
umap_settings = dict(nneighbors=5, mindist=0.01, spread=0.25)
scatter_df = process_scatterplot(libdict, **umap_settings)

# Display Scatter Plot
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# Save the plot as a standalone HTML file in the `scatterdir` directory
# (optional step).
output_file(filename=f"{scatterdir}umap.html", title = 'Gene Sets in the SynGO Synaptic Gene Annotations Library')
save(plot)