# GlyGen Glycosylated Proteins 2023
This notebook contains the code used to process the GlyGen Glycosylated Proteins dataset for Harmonizome. Three human proteoform glycosylation site citation datasets from the [GlyGen Data Portal](https://data.glygen.org) were combined to create an edge list of 2128 human proteins glycosylated by 1910 glycans.

In [None]:
import pandas as pd
import datetime
import numpy as np
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm

# UMAP
from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
# NOTE(review): the recursion limit is raised far above Python's default; presumably this is
# needed by the hierarchical clustering behind the sns.clustermap calls further down — confirm.
sys.setrecursionlimit(100000)

## Load and Pre-process Data

In [None]:
# Read the three GlyGen glycosylation-site citation exports. The per-source frames
# stay in scope because the next cell reports their individual shapes.
unicarb, harvard, glyconnect = map(pd.read_csv, (
    'human_proteoform_citations_glycosylation_sites_unicarbkb.csv',
    'human_proteoform_citations_glycosylation_sites_harvard.csv',
    'human_proteoform_citations_glycosylation_sites_glyconnect.csv',
))

# Stack the sources row-wise; the original row labels are kept (no re-indexing here).
glygen = pd.concat([unicarb, harvard, glyconnect])
glygen

In [None]:
# (rows, columns) of each source table, in the order they were concatenated.
tuple(frame.shape for frame in (unicarb, harvard, glyconnect))

In [None]:
# Distinct protein accessions and glycan accessions before any filtering.
# dropna=False counts a missing value as its own entry, matching len(Series.unique()).
glygen['uniprotkb_canonical_ac'].nunique(dropna=False), glygen['glytoucan_ac'].nunique(dropna=False)

In [None]:
# Reduce the table to protein–glycan pairs: keep the two accession columns,
# collapse repeated pairs, then discard pairs with a missing accession.
pair_columns = ['uniprotkb_canonical_ac', 'glytoucan_ac']
unique_pairs = glygen.get(pair_columns).drop_duplicates()
glygen = unique_pairs.dropna()
glygen

### Map Genes to Approved and Up-to-date NCBI Symbols and IDs

In [None]:
# Accession -> gene name lookup taken from the protein master list.
masterlist = pd.read_csv('human_protein_masterlist.csv', index_col='uniprotkb_canonical_ac')
uniprotgenes = masterlist['gene_name'].to_dict()

# Replace every accession with its gene name; an accession absent from the
# master list raises KeyError rather than being silently dropped.
glygen['uniprotkb_canonical_ac'] = [
    uniprotgenes[accession] for accession in glygen['uniprotkb_canonical_ac']
]
glygen.columns = ['Gene', 'Glycan']
glygen

In [None]:
# Lookup built from the mapping file: keys come from column 1, replacement symbols from column 2.
genemapping = pd.read_csv('../../mapping/mappingFile_2023.tsv', header=None, sep='\t').set_index(1)[2].to_dict()

# Keep only rows whose gene name has an entry in the mapping. The explicit .copy()
# makes the filtered frame independent, so the column assignment below does not
# raise SettingWithCopyWarning.
is_mapped = [symbol in genemapping for symbol in glygen['Gene']]
glygen = glygen.loc[is_mapped].copy()

# Translate to the mapped symbol and upper-case it *before* de-duplicating, so that
# pairs which only become identical after upper-casing are collapsed as well.
glygen['Gene'] = glygen['Gene'].map(genemapping).str.upper()
glygen = glygen.drop_duplicates().reset_index(drop=True)
glygen

In [None]:
# Distinct genes and glycans remaining after symbol mapping.
# dropna=False keeps the count identical to len(Series.unique()).
glygen['Gene'].nunique(dropna=False), glygen['Glycan'].nunique(dropna=False)

In [None]:
# Average number of gene rows per glycan. Counting the 'Gene' column directly avoids
# describe(), which would also compute unique/top/freq only to have them discarded.
glygen.groupby('Glycan')['Gene'].count().mean()

## Process Data for SQL Ingestion

### Resource

In [None]:
# Row for the resource table, one field per line:
#(id, name, long_description, short_description, url, num_attributes, num_datasets)
(
    108,  # id
    'GlyGen',  # name
    'GlyGen is a data integration and dissemination project for carbohydrate and glycoconjugate related data.',  # long_description
    'Computational and Informatics Resources for Glycoscience',  # short_description
    'https://glygen.org',  # url
    2371,  # num_attributes
    1,  # num_datasets
)

### Dataset

In [None]:
# Row for the dataset table, one field per line:
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, is_signed, is_continuous_valued, last_updated, directory, num_page_views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural, is_archived)
(
    147,  # id
    'GlyGen Glycosylated Proteins',  # name
    'Glycosylated Proteins',  # name_without_resource
    'proteins glycosylated by saccharide ligands from glycosylation site citations',  # description
    'protein-ligand (chemical) associations curated from protein association studies',  # association
    'proteins glycosylated by {0} ligand (chemical) from the GlyGen Glycosylated Proteins dataset.',  # gene_set_description
    'sets of proteins glycosylated by ligands (chemical) from the GlyGen Glycosylated Proteins dataset.',  # gene_sets_description
    'ligands (chemical) binding {0} protein from the GlyGen Glycosylated Proteins dataset.',  # attribute_set_description
    0,  # is_signed
    0,  # is_continuous_valued
    '2023-10-16',  # last_updated
    'glygen',  # directory
    0,  # num_page_views
    108,  # resource_fk (the GlyGen resource row)
    4,  # measurement_fk
    5,  # dataset_group_fk
    9,  # attribute_type_fk
    2,  # attribute_group_fk
    'association by literature curation',  # evidence_type
    'curated literature',  # evidence_group
    'low throughput, hypothesis driven',  # measurement_bias (typo "riven" corrected)
    'ligands (chemical)',  # attribute_type_plural (missing closing parenthesis restored)
    0,  # is_archived
)

### Publication

In [None]:
# Row for the publication table, one field per line:
#(id, long_citation, short_citation, url, pmid, pubmed_url, first_author_last_name, first_author_initials, journal_abbreviation, year, title, volume, pages)
(
    146,  # id
    'York, WS et al. (2020) GlyGen: Computational and Informatics Resources for Glycoscience. Glycobiol. 30:72-3.',  # long_citation
    'York, Glycobiol, 2020',  # short_citation
    'dx.doi.org/10.1093/glycob/cwz080',  # url
    31616925,  # pmid
    'https://www.ncbi.nlm.nih.gov/pubmed/31616925',  # pubmed_url
    'York',  # first_author_last_name
    'WS',  # first_author_initials
    'Glycobiol',  # journal_abbreviation
    2020,  # year
    'GlyGen: Computational and Informatics Resources for Glycoscience',  # title
    30,  # volume
    '72-3',  # pages
)

### Naming Authority

In [None]:
# Row for the naming authority table, one field per line:
#(id, name, description, url, publication_fk)
(
    104,  # id
    'GlyGen',  # name
    'GlyGen is a data integration and dissemination project for carbohydrate and glycoconjugate related data.',  # description
    'https://glygen.org',  # url
    146,  # publication_fk (the York et al. publication row)
)

### Gene

In [None]:
# First id to assign to genes that are not yet present in the gene table.
index = 57541

genes = pd.read_csv('../../tables/gene.csv')
genelist = genes['symbol'].str.upper().to_list()
# Key the foreign-key lookup by UPPER-CASED symbol. glygen['Gene'] is upper-cased, so a
# lookup keyed by the raw symbol would raise KeyError in the association cell for any
# gene whose symbol in the table is mixed case.
genefks = dict(zip(genelist, genes['id']))
known_symbols = set(genelist)  # set membership instead of scanning the list per gene

geneinfo = pd.read_csv('../../tables/gene_info', sep='\t', index_col='Symbol').get(['GeneID', 'description'])
for gene in glygen['Gene'].unique():
    gene = gene.upper()
    if gene not in known_symbols:
        # A symbol missing from gene_info raises KeyError here, as before.
        record = geneinfo.loc[gene]
        if isinstance(record, pd.DataFrame):
            # Symbol listed more than once: use the last row, which is also the entry
            # that survives the GeneID to_dict() lookup built for the edge list.
            record = record.iloc[-1]
        # Plain int so the printed tuple shows a bare number, not a numpy scalar repr.
        gene_id = int(record['GeneID'])
        # Emit the new gene row as a tuple literal: (id, symbol, GeneID, description, NCBI url).
        print((index, gene, gene_id, record['description'], 'https://ncbi.nlm.nih.gov/gene/'+str(gene_id)), end=',\n')
        genefks[gene] = index
        index+=1

### Attribute

In [None]:
# First id to assign to the glycan attributes.
index = 366001
# glycan accession -> attribute id, reused by the gene set cell.
attributefks = {}
for glycan in glygen['Glycan'].unique():
    # Emit the attribute row as a tuple literal: (id, accession, url, naming authority id).
    # The URL needs the '/' separator before the accession, matching the gene set URLs;
    # without it every link pointed at a non-existent 'glycan<accession>' path.
    print((index, glycan, 'https://glygen.org/glycan/'+glycan, 104), end=',\n')
    attributefks[glycan] = index
    index += 1

### Gene Set

In [None]:
# Gene set ids are assigned consecutively, one per glycan, starting from this value.
first_gene_set_id = 134800000
genesetfks = {
    glycan: first_gene_set_id + offset
    for offset, glycan in enumerate(glygen['Glycan'].unique())
}

# Emit one tuple literal per gene set:
# (id, accession, url, 147, 9, attribute id of the same glycan).
for glycan, gene_set_id in genesetfks.items():
    print((gene_set_id, glycan, 'https://glygen.org/glycan/'+glycan, 147, 9, attributefks[glycan]), end=',\n')

# Leave the running counter one past the last id that was handed out.
index = first_gene_set_id + len(genesetfks)

### Association

In [None]:
# Build the association table from the Gene and Glycan columns only. Selecting the
# columns explicitly keeps this cell re-runnable after the downloads section has added
# a 'threshold' column to glygen (a plain glygen.copy() would then carry an extra
# column and break the three-name column assignment).
associations = pd.DataFrame(
    {
        # Unknown genes / glycans raise KeyError instead of producing empty keys.
        'gene_fk': [genefks[gene] for gene in glygen['Gene']],
        'gene_set_fk': [genesetfks[glycan] for glycan in glygen['Glycan']],
        'threshold_value': 1,
    },
    # Row ids are glygen's row labels shifted by a fixed offset.
    index=glygen.index + 29000000,
)
associations.to_csv('../../harmonizome-update/glygen.csv')
associations

## Create Downloads

In [None]:
# Directory prefix for every download file written below (string-concatenated, so it
# must end with a slash).
output_path = 'downloads/'

# Every retained gene–glycan pair gets the constant threshold value 1.
glygen = glygen.assign(threshold=1)
glygen

### Gene Attribute Binary Matrix

In [None]:
# Gene x glycan matrix: 1 where the pair occurs in glygen, 0 elsewhere.
# - values is the explicit 'threshold' column rather than a bare scalar, matching the
#   documented array-like contract of pd.crosstab.
# - aggfunc is the string 'max': passing the builtin max is deprecated in recent pandas
#   and emits a FutureWarning.
# Pairs that never occur come back as NaN and are filled with 0.
binarymatrix = pd.crosstab(
    glygen['Gene'],
    glygen['Glycan'],
    values=glygen['threshold'],
    aggfunc='max',
).fillna(0)
# Transposed view (glycans as rows) used by the attribute-side outputs.
binarymatrixT = binarymatrix.T
binarymatrix.to_csv(output_path+'gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
binarymatrix

### Gene Attribute Edge List

In [None]:
# Upper-case the gene_info symbols so they can be matched against the upper-cased
# gene names in glygen, then derive a symbol -> GeneID lookup.
geneinfo.index = geneinfo.index.map(str.upper)
geneids = geneinfo['GeneID'].to_dict()

edgelist = glygen.copy()
# A gene without a GeneID entry raises KeyError here.
edgelist['Gene ID'] = [geneids[symbol] for symbol in edgelist['Gene']]

# Fix the column order and give the threshold column its published name.
edgelist = edgelist[['Gene', 'Gene ID', 'Glycan', 'threshold']].rename(
    columns={'threshold': 'Threshold Value'}
)
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# One row per distinct (Gene, Gene ID) pair, renumbered from 0.
gene_columns = edgelist[['Gene', 'Gene ID']]
geneslist = gene_columns.drop_duplicates().reset_index(drop=True)
geneslist.to_csv(output_path+'gene_list_terms.txt.gz', sep='\t', compression='gzip')
geneslist

### Attribute List

In [None]:
# One row per distinct glycan accession, renumbered from 0.
glycan_column = edgelist[['Glycan']]
attributeslist = glycan_column.drop_duplicates().reset_index(drop=True)
attributeslist.to_csv(output_path+'attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributeslist

### Up Gene Set Library

In [None]:
# Write one tab-separated line per glycan: the glycan accession followed by every gene
# whose matrix entry is 1. Glycans with fewer than 5 genes are left out.
with open(output_path+'gene_set_library_crisp.gmt', 'w') as f:
    is_member = binarymatrix.to_numpy(dtype=np.int_) == 1
    glycan_terms = binarymatrix.columns

    for column in tqdm(range(is_member.shape[1])):
        member_genes = binarymatrix.index[is_member[:, column]]
        if len(member_genes) >= 5:
            print(glycan_terms[column], *member_genes, sep='\t', end='\n', file=f)

### Up Attribute Set Library

In [None]:
# Write one tab-separated line per gene: the gene symbol followed by every glycan
# whose matrix entry is 1. Genes with fewer than 5 glycans are left out.
with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    is_member = binarymatrixT.to_numpy(dtype=np.int_) == 1
    gene_terms = binarymatrixT.columns

    for column in tqdm(range(is_member.shape[1])):
        member_glycans = binarymatrixT.index[is_member[:, column]]
        if len(member_glycans) >= 5:
            print(gene_terms[column], *member_glycans, sep='\t', end='\n', file=f)

### Gene Similarity Matrix

In [None]:
# pdist returns condensed pairwise cosine *distances* between gene rows; squareform
# expands them to a full square matrix, and 1 - distance converts to similarity.
gene_vectors = binarymatrix.to_numpy(dtype=np.int_)
gene_similarity_matrix = pd.DataFrame(
    data=1 - dist.squareform(dist.pdist(gene_vectors, 'cosine')),
    index=binarymatrix.index,
    columns=binarymatrix.index,
)
# Clear the axis names so the written header has no axis label.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Same construction as the gene similarity matrix, applied to glycan rows:
# condensed cosine distances -> square matrix -> 1 - distance.
glycan_vectors = binarymatrixT.to_numpy(dtype=np.int_)
attribute_similarity_matrix = pd.DataFrame(
    data=1 - dist.squareform(dist.pdist(glycan_vectors, 'cosine')),
    index=binarymatrixT.index,
    columns=binarymatrixT.index,
)
# Clear the axis names so the written header has no axis label.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Knowledge Graph Serialization

In [None]:
# In-memory knowledge graph shared by the RDF / JSON / TSV writers below:
#   nodes: node id -> {"type", "properties"}; gene nodes are keyed by integer Gene ID,
#          glycan nodes by accession string.
#   edges: one dict per gene–glycan pair, directed glycan -> gene.
nodes = {}
edges = []

for symbol, gene_id in zip(geneslist['Gene'], geneslist['Gene ID']):
    gene_id = int(gene_id)
    nodes[gene_id] = {
        "type": "gene",
        "properties": {"id": gene_id, "label": symbol},
    }

for accession in attributeslist['Glycan']:
    # Glycans have no separate display name, so the accession is both label and id.
    nodes[accession] = {
        "type": "glycan",
        "properties": {"label": accession, "id": accession},
    }

for symbol, gene_id, accession in zip(edgelist['Gene'], edgelist['Gene ID'], edgelist['Glycan']):
    edges.append({
        "source": accession,
        "relation": "molecularly interacts with",
        "target": int(gene_id),
        "properties": {
            # Edge id is "<glycan accession>:<Gene ID>".
            "id": accession+':'+str(gene_id),
            "source_id": accession,
            "source_label": accession,
            "target_id": int(gene_id),
            "target_label": symbol,
            "directed": True,
            "threshold": 1,
        },
    })

#### RDF

In [None]:
# Write the prefix header, a blank line, then one "subject predicate object ." line per edge.
# NOTE(review): the @prefix lines carry no <...> IRI brackets and no terminating '.', so the
# file is probably not parseable as standard Turtle — confirm what the consumer expects.
with open(output_path+'kg_serializations/glygen.rdf', 'w') as f:
    header_lines = (
        '@prefix gene: ncbi.nlm.nih.gov/gene/',
        '@prefix RO: purl.obolibrary.org/RO_',
        '@prefix GlyGen: glygen.org/glycan/',
        '',
    )
    f.write('\n'.join(header_lines) + '\n')

    for edge in edges:
        props = edge['properties']
        f.write(f"GlyGen:{props['source_id']} RO:0002436 gene:{props['target_id']} .\n")
    

#### JSON

In [None]:
# Serialize the whole graph as a single JSON document. json.dump writes straight to the
# file and returns None, so its result is not bound to a name.
# Integer node keys (gene ids) are written as strings, as JSON object keys must be.
with open(output_path+'kg_serializations/glygen.json', 'w') as f:
    json.dump(
        {
            "Version":"1",
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

#### TSV

In [None]:
# One row per node; after the transpose the columns are 'type' and 'properties'.
nodeframe = pd.DataFrame(nodes).T

# Flatten the nested properties into plain columns.
node_properties = nodeframe['properties']
nodeframe['id'] = [props['id'] for props in node_properties]
nodeframe['label'] = [props['label'] for props in node_properties]

# Namespace is decided by node type; an unexpected type raises KeyError.
namespace = {'gene':'NCBI Entrez', 'glycan':'GlyGen'}
nodeframe['namespace'] = [namespace[node_type] for node_type in nodeframe['type']]

nodeframe = nodeframe[['namespace', 'id', 'label']].reset_index(drop=True)
nodeframe.to_csv(output_path+'kg_serializations/glygen_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# One row per edge; the threshold is pulled out of the nested properties dict and
# only the four published columns are kept.
edgeframe = pd.DataFrame(edges)
edgeframe['threshold'] = [props['threshold'] for props in edgeframe['properties']]
edgeframe = edgeframe[['source', 'relation', 'target', 'threshold']]
edgeframe.to_csv(output_path+'kg_serializations/glygen_tsv/edges.tsv', sep='\t')
edgeframe

## Create Visualizations

### Gene Attribute Clustered Heatmap

In [None]:
# Hierarchically clustered heatmap of the gene x glycan matrix, colour scale centred on 0.
sns.clustermap(data=binarymatrix, center=0, cmap='seismic')

### Gene Similarity Clustered Heatmap

In [None]:
# Hierarchically clustered heatmap of gene–gene cosine similarity, colour scale centred on 0.
sns.clustermap(data=gene_similarity_matrix, center=0, cmap='seismic')

### Attribute Similarity Clustered Heatmap

In [None]:
# Hierarchically clustered heatmap of glycan–glycan cosine similarity, colour scale centred on 0.
sns.clustermap(data=attribute_similarity_matrix, center=0, cmap='seismic')

### UMAP

In [None]:
def load_gmt(file):
    """Parse a tab-separated gene set library into an ordered term -> genes mapping.

    Each non-blank line is expected to be ``term<TAB>gene<TAB>gene...`` (no description
    column, matching the files written by this notebook).

    Parameters
    ----------
    file : iterable of str
        Open text file, or any iterable yielding lines.

    Returns
    -------
    OrderedDict
        Maps each term to a single space-separated string of its distinct genes,
        in the order the genes first appear on the line. A later line with the
        same term replaces the earlier one.
    """
    gmt = OrderedDict()
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline) would otherwise create an empty '' term.
            continue
        term, *geneset = line.split('\t')
        # dict.fromkeys de-duplicates while preserving file order; a set would make the
        # gene order depend on string hashing and therefore vary from run to run.
        gmt[term] = ' '.join(dict.fromkeys(geneset))
    return gmt
# Read the gene set library written earlier. The context manager closes the file
# handle once parsing is done instead of leaving it open for the rest of the session.
with open('downloads/gene_set_library_crisp.gmt', 'r') as gmt_file:
    libdict = load_gmt(gmt_file)

# Directory that receives the saved UMAP page.
scatterdir = 'images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed the gene sets in 2-D with UMAP and label them with Leiden clusters.

    Each value of ``libdict`` is treated as one text document and TF-IDF vectorized;
    the resulting matrix is wrapped in an AnnData object on which scanpy builds a
    neighbor graph, runs Leiden clustering and computes a UMAP layout.

    Parameters
    ----------
    libdict : mapping of term -> space-separated gene string (as built by load_gmt).
    nneighbors : passed to ``sc.pp.neighbors`` as ``n_neighbors``.
    mindist, spread : passed to ``sc.tl.umap`` as ``min_dist`` and ``spread``.
    maxdf, mindf : passed to ``TfidfVectorizer`` as ``max_df`` and ``min_df``.

    Returns
    -------
    pandas.DataFrame with columns ``x``, ``y`` (UMAP coordinates), ``cluster``
    ("Cluster <leiden label>"), ``term`` and ``genes``, with rows sorted by the
    Leiden label.
    """
    print("\tTF-IDF vectorizing gene set data...")
    vec = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    X = vec.fit_transform(libdict.values())
    # Reports (number of gene sets, number of vocabulary tokens).
    print(X.shape)
    adata = anndata.AnnData(X)
    # Label each observation with its term so rows can be traced back after sorting.
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    ### the n_neighbors and min_dist parameters can be altered
    sc.pp.neighbors(adata, n_neighbors=nneighbors)
    sc.tl.leiden(adata, resolution=1.0)
    # Fixed random_state so the layout call is seeded the same way on every run.
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder observations by cluster label, then turn the labels into display strings.
    new_order = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[new_order, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    df = pd.DataFrame(adata.obsm['X_umap'])
    df.columns = ['x', 'y']

    # df has a fresh 0..n-1 index, so these columns are attached by position.
    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    df['genes'] = [libdict[l] for l in df['term']]

    return df

In [None]:
def get_scatter_colors(df):
    """Assign a palette colour to every distinct value of ``df['cluster']``.

    Clusters are taken in order of first appearance. The palette is Category20
    rearranged so that its even-indexed entries come first, followed by the
    odd-indexed ones; with more than 20 clusters the colours repeat.

    Returns a dict mapping cluster label -> colour string.
    """
    palette = list(Category20[20])
    colors = palette[::2] + palette[1::2]
    clusters = pd.unique(df['cluster']).tolist()
    return {
        cluster: colors[position % 20]
        for position, cluster in enumerate(clusters)
    }

def get_scatterplot(scatterdf):
    """Build the interactive Bokeh scatter plot of the UMAP embedding.

    Parameters
    ----------
    scatterdf : pandas.DataFrame
        Must provide the columns ``x``, ``y``, ``cluster`` and ``term``
        (the frame returned by process_scatterplot has them). The input is
        not modified; a copy is coloured and plotted.

    Returns
    -------
    The configured Bokeh figure, one point per gene set, coloured by cluster,
    with a hover tooltip showing the gene set, its coordinates and its cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Tooltip template; @name placeholders refer to columns of the data source below.
    # Every field sits in its own properly closed <div> (the inner blocks were
    # previously left unclosed, nesting each field inside the previous one).
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000,
        height=700,
        tools=tools_emb
    )

    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # Hide axis tick marks and tick labels: UMAP coordinates have no meaningful scale.
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt'

    plot_emb.output_backend = "svg"

    # Title and axis captions.
    plot_emb.title = 'Gene Sets in the GlyGen Glycosylated Proteins Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'

    # One marker per gene set; 'colors' is the per-row colour column of the source.
    plot_emb.scatter(
        'x',
        'y',
        size = 4,
        source = source,
        color = 'colors'
    )

    return plot_emb

In [None]:
# Embedding settings for this dataset. Anything not listed keeps the function default
# (defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1);
# spread, maxdf and mindf are further knobs that can be added here when tuning.
umap_settings = dict(nneighbors=15, mindist=0.1)
scatter_df = process_scatterplot(libdict, **umap_settings)

# Render the scatter plot inline in the notebook.
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# Save the plot as a standalone HTML page.
# scatterdir may or may not end with a slash; stripping it first avoids a doubled
# separator such as 'images//umap.html'. The page title matches the plot title.
output_file(
    filename=f"{scatterdir.rstrip('/')}/umap.html",
    title='Gene Sets in the GlyGen Glycosylated Proteins Library',
)
save(plot)