# WikiPathways PFOCR Harmonizome Processing

Pathway Figure OCR (PFOCR) is an open science project dedicated to extracting pathway information from figures in the published literature so that it can be freely used by anyone. The data were downloaded from WikiPathways as a gene set library. A pathway figure-protein edge list was computed from the downloaded gene set library and then manually reviewed to correct incorrectly encoded/decoded characters. Gene IDs were converted to gene symbols using mapping files derived from the NCBI Entrez Gene database.

In [None]:
import pandas as pd
import datetime
import numpy as np
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm
from collections import OrderedDict

from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
# Configure Bokeh to render its output inside the notebook.
output_notebook()

from IPython.display import display, HTML, Markdown
# Raise the interpreter's recursion limit far above the default.
# NOTE(review): presumably needed for the clustered heatmaps on large matrices — confirm.
sys.setrecursionlimit(100000)

## Load and Preprocess Data

In [None]:
def load_gmt(file):
    """Parse a GMT gene set library into a dict keyed by term.

    Every non-blank line is expected to hold tab-separated fields: the term,
    a description, then zero or more gene identifiers.

    Args:
        file: Iterable of text lines, e.g. an open file handle.

    Returns:
        dict mapping each term to ``{'desc': str, 'geneset': set of str}``.

    Raises:
        ValueError: If a non-blank line has fewer than two fields.
    """
    gmt = {}
    for line in file:
        line = line.strip()
        if not line:
            # Blank lines (e.g. a trailing newline at end of file) hold no record.
            continue
        term, desc, *geneset = line.split('\t')
        # Doubled tabs leave empty fields; never treat '' as a gene identifier.
        gmt[term] = {'desc': desc, 'geneset': {gene for gene in geneset if gene}}
    return gmt

In [None]:
# Parse the PFOCR GMT inside a context manager so the file handle is closed
# as soon as parsing finishes.
with open('newdata/WikiPathwaysPFOCR/pfocr-20210515-gmt-Homo_sapiens.gmt', 'r') as gmt_file:
    pfocr = load_gmt(gmt_file)
# Terms become rows after the transpose; explode then yields one row per
# (figure, gene id) pair.
pfocr = pd.DataFrame(pfocr).T.explode('geneset').reset_index()
pfocr.columns = ['figure','pathway', 'geneid']
pfocr

## Process Data

In [None]:
# Build the Entrez Gene ID -> symbol lookup from the mapping table, whose
# columns are relabelled (symbol, id) in file order.
mapping_table = pd.read_csv('mapping/GeneSymbolsAndIDS_2023.tsv', sep='\t')
mapping_table.columns = ['symbol', 'id']
symbol_by_id = mapping_table.set_index('id')['symbol']
geneids = symbol_by_id.to_dict()

In [None]:
# Translate each gene ID (cast to int for the lookup) into its symbol; an ID
# that is missing from the lookup raises KeyError.
pfocr['gene'] = [geneids[int(entrez_id)] for entrez_id in pfocr['geneid']]
pfocr

## Prepare Data for SQL Ingestion

### Dataset

In [None]:
# Value tuple for the dataset record of this resource; the commented line
# directly below names the field each position corresponds to.
# NOTE(review): "num_page views" in that list is presumably num_page_views — confirm against the schema.
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, is_signed, is_continuous_valued, last_updated, directory, num_page views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural)
(140, 'Wikipathways Pathway Figure OCR', 'Pathway Figure OCR', 'sets of genes extracted from pathway figure using optical character recognition', 'protein-pathway associations extracted from figures in published literature', 'proteins participating in the {0} pathway from the Wikipathways PFOCR dataset.', 'sets of proteins participating in pathways from the Wikipathways PFOCR dataset.', 'pathways involving {0} protein from the Wikipathways PFOCR dataset.', 0, 0, '2023-05-25', 'pfocr', 0, 75, 4, 6, 22, 4, 'association by literature curation', 'curated literature', 'low throughput, hypothesis driven', 'pathways')

### Publication

In [None]:
# Value tuple for the publication record; the commented line directly below
# names the field each position corresponds to. The author surname in both
# citation strings is spelled 'Hanspers', matching first_author_last_name.
#(id, long_citation, short_citation, url, pmid, pubmed_url, first_author_last_name, first_author_initials, journal_abbreviateion, year, title, volume, pages)
(142, 'Hanspers, K et al. (2020) Pathway information extracted from 25 years of pathway figures. Genome Biol. 21:273', 'Hanspers, Genome Biol, 2020', 'dx.doi.org/10.1186/s13059-020-02181-2', 33168034, 'https://ncbi.nlm.nih.gov/pubmed/33168034', 'Hanspers', 'K', 'Genome Biol', 2020, 'Pathway information extracted from 25 years of pathway figures', 21, 273)

### Gene

In [None]:
# Genes listed in production/gene.csv: their Entrez IDs and an
# Entrez ID -> primary key lookup.
genes = pd.read_csv('production/gene.csv')
genelist = genes['ncbi_entrez_gene_id'].to_list()
genefks = genes.set_index('ncbi_entrez_gene_id')['id'].to_dict()

# Additional genes listed in tables/newgenes.csv (first CSV column is the index).
newgenes = pd.read_csv('tables/newgenes.csv', index_col=0)
newgenelist = newgenes['ncbi_entrez_gene_id'].to_list()
genefks.update(newgenes.reset_index().set_index('ncbi_entrez_gene_id')['id'].to_dict())

# symbol -> Entrez Gene ID, keeping the first row per symbol; entries from
# newgenes overwrite any symbol they share with the mapping table.
symbol_column = 'Human, Mouse, and Rat Approved Symbol'
entrez_column = 'Entrez Gene ID(supplied by NCBI)'
geneids = (
    pd.read_csv('tables/GeneSymbolsAndIDS_2023.tsv', sep='\t')
    .drop_duplicates(symbol_column)
    .set_index(symbol_column)[entrez_column]
    .to_dict()
)
geneids.update(newgenes.set_index('symbol')['ncbi_entrez_gene_id'].to_dict())

# Entrez Gene ID -> description
genedescs = (
    pd.read_csv('tables/gene_info', sep='\t')
    .get(['GeneID', 'description'])
    .set_index('GeneID')['description']
    .to_dict()
)

In [None]:
index = 57238
geneurl = 'https://ncbi.nlm.nih.gov/gene/'

# Entrez IDs that already have a gene row. A set gives constant-time
# membership tests instead of scanning two lists for every gene.
known_gene_ids = set(genelist) | set(newgenelist)

# Print one value tuple (id, symbol, Entrez ID, description, URL) for each gene
# that still needs a row, and record the primary key assigned to it.
for gene in pfocr['gene'].unique():
    entrez_id = geneids[gene]  # not named `id`, so the builtin is not shadowed
    if entrez_id in known_gene_ids:
        continue
    print((index, gene, entrez_id, genedescs[entrez_id], geneurl+str(entrez_id)), end=',\n')
    genefks[entrez_id] = index
    known_gene_ids.add(entrez_id)  # never emit two rows for the same Entrez ID
    index += 1

### Attribute

In [None]:
# Attribute rows whose naming_authority_fk is 97.
attribute_table = pd.read_csv('production/attribute.csv')
attribute_table = attribute_table.loc[attribute_table['naming_authority_fk'] == 97]
# attribute name -> primary key, plus the plain list of names.
attributefks = attribute_table.set_index('name_from_naming_authority')['id'].to_dict()
attributes = attribute_table['name_from_naming_authority'].to_list()

In [None]:
#(id, name_from_naming_authority, naming_authority_fk)
index = 324536

# Snapshot the existing names into a set once, so each membership test is
# constant-time rather than a scan of the whole collection.
existing_attributes = set(attributes)

# Print one value tuple per figure that has no attribute row yet and remember
# the primary key assigned to it.
for pathway in pfocr['figure'].unique():
    if pathway not in existing_attributes:
        print((index, pathway, 97), end=',\n')
        attributefks[pathway] = index
        index += 1

### Gene Set

In [None]:
#(id, name_from_dataset, id_from_dataset, description_from_dataset, url_from_dataset, dataset_fk, attribute_type, attribute_fk)
index = 134100000
geneseturl = 'https://pfocr.wikipathways.org/figures/'
# figure -> text from the 'pathway' column
genesets = pfocr.set_index('figure')['pathway'].to_dict()
genesetfks = {}

# Print one value tuple per figure, numbering rows consecutively from `index`
# and remembering the key given to each figure.
for pathway in pfocr['figure'].unique():
    row = (
        index,
        pathway,
        pathway,
        genesets[pathway],
        geneseturl + pathway + '.html',
        140,
        22,
        attributefks[pathway],
    )
    print(row, end=',\n')
    genesetfks[pathway] = index
    index += 1

### Association

In [None]:
index = 20000000

# Keep only the edges whose figure has an entry in genesetfks.
in_gene_sets = pfocr['figure'].apply(lambda figure_id: figure_id in genesetfks)
associations = pfocr.loc[in_gene_sets, ['geneid', 'figure']].copy()
associations = associations.rename(columns={'geneid': 'gene_fk', 'figure': 'gene_set_fk'})

# Swap the raw identifiers for the keys recorded in genefks / genesetfks.
associations['gene_fk'] = associations['gene_fk'].apply(lambda entrez: genefks[int(entrez)])
associations['gene_set_fk'] = associations['gene_set_fk'].apply(genesetfks.__getitem__)
associations['threshold_value'] = 1

# De-duplicate, then number the rows consecutively starting at `index`.
associations = (
    associations
    .drop_duplicates()
    .reset_index(drop=True)
)
associations.index = associations.index + index
associations.index.name = 'id'
associations

In [None]:
# Write the association rows to CSV (the frame's index is written as well).
association_csv = 'harmonizome-update/wikipathwayspfocr.csv'
associations.to_csv(association_csv)

## Create Downloads

In [None]:
# Directory for the download files. File names are appended with `+` in the
# cells below, so the trailing slash is required.
output_path = 'newdata/WikiPathwaysPFOCR/downloads/'

### Gene-Attribute Binary Matrix

In [None]:
# Cross-tabulate genes (rows) against figures (columns). crosstab counts
# occurrences, so a symbol that appears more than once for the same figure
# (e.g. reached from two different gene IDs) would give a value above 1.
# Clipping keeps the matrix strictly 0/1, which the `== 1` membership tests
# used when writing the GMT libraries rely on.
binarymatrix = pd.crosstab(pfocr['gene'], pfocr['figure']).clip(upper=1)
binarymatrixT = binarymatrix.T
binarymatrix.to_csv(output_path+'gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
binarymatrix

### Gene-Attribute Edge List

In [None]:
# Edge list: the gene columns followed by the pathway/figure columns.
edge_columns = ['gene', 'geneid', 'pathway', 'figure']
edgelist = pfocr[edge_columns]
edgelist.to_csv(
    output_path+'gene_attribute_edges.txt.gz',
    sep='\t',
    compression='gzip',
)
edgelist

### Gene List

In [None]:
# Distinct (symbol, gene id) pairs, renumbered from zero.
geneslist = (
    pfocr[['gene', 'geneid']]
    .drop_duplicates()
    .reset_index(drop=True)
)
geneslist.to_csv(
    output_path+'gene_list_terms.txt.gz',
    sep='\t',
    compression='gzip',
)
geneslist

### Attribute List

In [None]:
# Distinct (pathway, figure) pairs, renumbered from zero.
attributeslist = (
    pfocr[['pathway', 'figure']]
    .drop_duplicates()
    .reset_index(drop=True)
)
attributeslist.to_csv(
    output_path+'attribute_list_entries.txt.gz',
    sep='\t',
    compression='gzip',
)
attributeslist

### Gene Set Library

In [None]:
with open(output_path+'gene_set_library_crisp.gmt', 'w') as f:
    arr = binarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)
    attributes = binarymatrix.columns
    gene_labels = binarymatrix.index

    w, h = arr.shape
    for i in tqdm(range(h)):
        # Row labels flagged 1 in column i form that attribute's gene set.
        members = gene_labels[arr[:, i] == 1]
        if len(members) < 5:
            continue  # sets with fewer than 5 genes are not written
        # One tab-separated line: the attribute, then its genes.
        f.write('\t'.join(map(str, (attributes[i], *members))) + '\n')

### Attribute Set Library

In [None]:
with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    arr = binarymatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)
    genes = binarymatrixT.columns
    attribute_labels = binarymatrixT.index

    w, h = arr.shape
    for i in tqdm(range(h)):
        # Row labels flagged 1 in column i form that gene's attribute set.
        members = attribute_labels[arr[:, i] == 1]
        if len(members) < 5:
            continue  # sets with fewer than 5 attributes are not written
        # One tab-separated line: the gene, then its attributes.
        f.write('\t'.join(map(str, (genes[i], *members))) + '\n')

### Gene Similarity Matrix

In [None]:
# Pairwise cosine distance between the rows of the binary matrix, expanded
# from the condensed vector to a square matrix and turned into a similarity
# (1 - distance).
gene_vectors = binarymatrix.to_numpy(dtype=np.int_)
gene_similarity_matrix = 1 - dist.squareform(dist.pdist(gene_vectors, 'cosine'))

gene_similarity_matrix = pd.DataFrame(
    data=gene_similarity_matrix,
    index=binarymatrix.index,
    columns=binarymatrix.index,
)
# Clear both axis names before saving.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(
    output_path+'gene_similarity_matrix_cosine.txt.gz',
    sep='\t',
    compression='gzip',
)
gene_similarity_matrix

### Attribute Similarity Matrix
Computing the attribute similarity matrix over every gene set caused crashes during creation. Therefore, a smaller version of the attribute similarity matrix was created that includes only gene sets with at least 5 genes (length >= 5).

In [None]:
def load_gmt(file):
    """Parse a description-less GMT file into a dict keyed by term.

    Every non-blank line is expected to hold a term followed by
    tab-separated gene names.

    Args:
        file: Iterable of text lines, e.g. an open file handle.

    Returns:
        dict mapping each term to ``{'geneset': set of str}``.
    """
    gmt = {}
    for line in file:
        line = line.strip()
        if not line:
            # A blank line would otherwise register an empty '' term.
            continue
        term, *geneset = line.split('\t')
        gmt[term] = {'geneset': set(geneset)}
    return gmt

# Parse the saved library inside a context manager so the file handle is
# closed once parsing finishes.
with open('newdata/WikiPathwaysPFOCR/downloads/gene_set_library_crisp.gmt', 'r') as gmt_file:
    gmt = load_gmt(gmt_file)
# One row per (pathway, gene) pair.
gmt = pd.DataFrame(gmt).T.explode('geneset').reset_index()
gmt.columns = ['pathway', 'gene']

# Gene-by-pathway table and its transpose.
gmtmatrix = pd.crosstab(gmt['gene'], gmt['pathway'])
gmtmatrixT = gmtmatrix.T
gmtmatrix

In [None]:
# Pairwise cosine distance between the rows of the transposed matrix,
# expanded to a square matrix and turned into a similarity (1 - distance).
attribute_vectors = gmtmatrixT.to_numpy(dtype=np.int_)
attribute_similarity_matrix = 1 - dist.squareform(dist.pdist(attribute_vectors, 'cosine'))

attribute_similarity_matrix = pd.DataFrame(
    data=attribute_similarity_matrix,
    index=gmtmatrixT.index,
    columns=gmtmatrixT.index,
)
# Clear both axis names before saving.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(
    output_path+'attribute_similarity_matrix_cosine.txt.gz',
    sep='\t',
    compression='gzip',
)
attribute_similarity_matrix

### Knowledge Graph Serialization

In [None]:
nodes = {}
edges = []

# One node per row of geneslist, keyed by the gene ID as stored in the frame.
for gene_row in geneslist.itertuples(index=False):
    nodes[gene_row.geneid] = {
        "type": "gene",
        "properties": {
            "id": int(gene_row.geneid),
            "label": gene_row.gene,
        },
    }

# One node per row of attributeslist, keyed by the figure identifier.
for figure_row in attributeslist.itertuples(index=False):
    nodes[figure_row.figure] = {
        "type": "pathway",
        "properties": {
            "id": figure_row.figure,
            "label": figure_row.pathway,
        },
    }

# One edge per row of edgelist, pointing from the gene to the figure.
for edge_row in edgelist.itertuples(index=False):
    edges.append({
        "source": edge_row.geneid,
        "relation": "participates in",
        "target": edge_row.figure,
        "properties": {
            "id": str(edge_row.geneid) + ":" + edge_row.figure,
            "source_id": int(edge_row.geneid),
            "source_label": edge_row.gene,
            "target_label": edge_row.pathway,
            "target_id": edge_row.figure,
            "directed": True,
            "threshold": 1,
        },
    })

#### RDF

In [None]:
with open(output_path+'kg_serializations/pfocr.rdf', 'w') as f:
    # Prefix header followed by a blank separator line.
    # NOTE(review): the prefix lines carry no <...> IRI brackets or trailing
    # '.', and the GO prefix is not used below — confirm what consumers of this
    # file expect before changing the format.
    header_lines = (
        '@prefix gene: ncbi.nlm.nih.gov/gene/',
        '@prefix RO: purl.obolibrary.org/RO_',
        '@prefix GO: amigo.geneontology.org/amigo/term/GO:',
        '',
    )
    for header_line in header_lines:
        print(header_line, file=f)
    # One "subject predicate object ." line per edge.
    for edge in edges:
        props = edge['properties']
        f.write(f"gene:{props['source_id']} RO:0000056 {props['target_id']} .\n")

#### JSON

In [None]:
with open(output_path+'kg_serializations/pfocr.json', 'w') as f:
    # json.dump writes straight to the file object and returns None, so its
    # result is not bound to a name.
    json.dump(
        {
            "Version":"1",
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

#### TSV

In [None]:
# Flatten the node dict into a three-column table, one row per node in
# insertion order; the namespace is derived from the node type.
namespace_by_type = {'gene':'NCBI Entrez', 'pathway':'Wikipathways'}
nodeframe = pd.DataFrame(
    {
        'namespace': [namespace_by_type[node['type']] for node in nodes.values()],
        'id': [node['properties']['id'] for node in nodes.values()],
        'label': [node['properties']['label'] for node in nodes.values()],
    }
)
nodeframe.to_csv(
    output_path+'kg_serializations/wikipathwayspfocr_tsv/nodes.tsv',
    sep='\t',
)
nodeframe

In [None]:
# Flatten the edge records into a four-column table; the threshold is pulled
# out of each edge's nested properties.
edgeframe = pd.DataFrame(
    {
        'source': [edge['source'] for edge in edges],
        'relation': [edge['relation'] for edge in edges],
        'target': [edge['target'] for edge in edges],
        'threshold': [edge['properties']['threshold'] for edge in edges],
    }
)
edgeframe.to_csv(
    output_path+'kg_serializations/wikipathwayspfocr_tsv/edges.tsv',
    sep='\t',
)
edgeframe

## Create Visualizations

### Gene-Attribute Clustered Heatmap

In [None]:
# Clustered heatmap of the gene-attribute matrix, colour scale centred on 0.
sns.clustermap(data=binarymatrix, cmap='seismic', center=0)

### Gene-Gene Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the gene-gene similarity matrix, colour scale centred on 0.
sns.clustermap(data=gene_similarity_matrix, cmap='seismic', center=0)

### Attribute-Attribute Similarity Clustered Heatmap
Larger versions of the attribute similarity matrix crashed when attempting to compute a clustered heatmap. Therefore, a new attribute similarity matrix was created including only gene sets with length >= 10.

In [None]:
# Read the saved gene-attribute matrix back from disk, using its 'gene'
# column as the index.
output_path = 'newdata/WikiPathwaysPFOCR/downloads/'
binarymatrix = pd.read_csv(
    output_path+'gene_attribute_matrix.txt.gz',
    sep='\t',
    compression='gzip',
    index_col='gene',
)

In [None]:
with open(output_path+'gene_set_library_crisp_10.gmt', 'w') as f:
    arr = binarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)
    attributes = binarymatrix.columns
    gene_labels = binarymatrix.index

    w, h = arr.shape
    for i in tqdm(range(h)):
        # Row labels flagged 1 in column i form that attribute's gene set.
        members = gene_labels[arr[:, i] == 1]
        if len(members) < 10:
            continue  # sets with fewer than 10 genes are not written
        # One tab-separated line: the attribute, then its genes.
        f.write('\t'.join(map(str, (attributes[i], *members))) + '\n')

In [None]:
def load_gmt(file):
    """Parse a description-less GMT file into a dict keyed by term.

    Every non-blank line is expected to hold a term followed by
    tab-separated gene names.

    Args:
        file: Iterable of text lines, e.g. an open file handle.

    Returns:
        dict mapping each term to ``{'geneset': set of str}``.
    """
    gmt = {}
    for line in file:
        line = line.strip()
        if not line:
            # A blank line would otherwise register an empty '' term.
            continue
        term, *geneset = line.split('\t')
        gmt[term] = {'geneset': set(geneset)}
    return gmt

# Parse the saved library inside a context manager so the file handle is
# closed once parsing finishes.
with open('newdata/WikiPathwaysPFOCR/downloads/gene_set_library_crisp_10.gmt', 'r') as gmt_file:
    gmt = load_gmt(gmt_file)
# One row per (pathway, gene) pair.
gmt = pd.DataFrame(gmt).T.explode('geneset').reset_index()
gmt.columns = ['pathway', 'gene']

# Gene-by-pathway table and its transpose.
gmtmatrix = pd.crosstab(gmt['gene'], gmt['pathway'])
gmtmatrixT = gmtmatrix.T
gmtmatrix

In [None]:
# Pairwise cosine distance between the rows of the transposed matrix,
# expanded to a square matrix and turned into a similarity (1 - distance).
attribute_vectors = gmtmatrixT.to_numpy(dtype=np.int_)
attribute_similarity_matrix = 1 - dist.squareform(dist.pdist(attribute_vectors, 'cosine'))

attribute_similarity_matrix = pd.DataFrame(
    data=attribute_similarity_matrix,
    index=gmtmatrixT.index,
    columns=gmtmatrixT.index,
)
# Clear both axis names.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix

In [None]:
# Clustered heatmap of the attribute-attribute similarity matrix, colour scale centred on 0.
sns.clustermap(data=attribute_similarity_matrix, cmap='seismic', center=0)

### UMAP

In [None]:
def load_gmt(file):
    """Read a description-less GMT file into an ordered term -> genes mapping.

    Every non-blank line is expected to hold a term followed by
    tab-separated gene names.

    Args:
        file: Iterable of text lines, e.g. an open file handle.

    Returns:
        OrderedDict mapping each term to a single string of its unique gene
        names joined by spaces, in the order they first appear on the line.
    """
    gmt = OrderedDict()
    for line in file:
        line = line.strip()
        if not line:
            # A blank line would otherwise register an empty '' term.
            continue
        term, *geneset = line.split('\t')
        # dict.fromkeys de-duplicates while keeping file order, so the joined
        # string is identical on every run (set iteration order is not).
        gmt[term] = ' '.join(dict.fromkeys(geneset))
    return gmt

In [None]:
# Parse the saved library inside a context manager so the file handle is
# closed once parsing finishes.
with open('newdata/WikiPathwaysPFOCR/downloads/gene_set_library_crisp.gmt', 'r') as gmt_file:
    libdict = load_gmt(gmt_file)
scatterdir = 'newdata/WikiPathwaysPFOCR/images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Compute a 2-D UMAP embedding and Leiden clusters for a gene set library.

    Args:
        libdict: Mapping of term -> text of gene names. The values are
            vectorized as documents and the keys label the observations.
        nneighbors: ``n_neighbors`` passed to ``sc.pp.neighbors``.
        mindist: ``min_dist`` passed to ``sc.tl.umap``.
        spread: ``spread`` passed to ``sc.tl.umap``.
        maxdf: ``max_df`` passed to ``TfidfVectorizer``.
        mindf: ``min_df`` passed to ``TfidfVectorizer``.

    Returns:
        DataFrame with columns ``x`` and ``y`` (the ``X_umap`` coordinates),
        ``cluster`` (``'Cluster '`` + Leiden label), ``term`` and ``genes``,
        with rows ordered by Leiden label.
    """
    print("\tTF-IDF vectorizing gene set data...")
    vec = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    X = vec.fit_transform(libdict.values())
    print(X.shape)
    adata = anndata.AnnData(X)
    # Label each observation with its term.
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    ### the n_neighbors and min_dist parameters can be altered
    sc.pp.neighbors(adata, n_neighbors=nneighbors)
    sc.tl.leiden(adata, resolution=1.0)
    # Fixed random_state for the UMAP step.
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder the observations by their Leiden label.
    new_order = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[new_order, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    df = pd.DataFrame(adata.obsm['X_umap'])
    df.columns = ['x', 'y']

    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    # Look the gene text up again by term, since the rows were reordered.
    df['genes'] = [libdict[l] for l in df['term']]

    return df

In [None]:
def get_scatter_colors(df):
    """Map each distinct value of ``df['cluster']`` to a Category20 color.

    Labels are taken in order of first appearance; the 20 colors are reused
    cyclically when there are more than 20 labels.
    """
    palette = list(Category20[20])
    # Even-indexed palette entries first, then the odd-indexed ones.
    ordered_colors = palette[0::2] + palette[1::2]
    cluster_labels = pd.unique(df['cluster']).tolist()
    return {
        label: ordered_colors[position % 20]
        for position, label in enumerate(cluster_labels)
    }

def get_scatterplot(scatterdf):
    """Build the Bokeh scatter plot for an embedding table.

    Args:
        scatterdf: DataFrame with columns ``x``, ``y``, ``cluster`` and
            ``term``. It is copied, so the caller's frame is not modified.

    Returns:
        The Bokeh figure, with one point per row colored by cluster and a
        hover tooltip showing the gene set, coordinates and cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Each tooltip row is its own closed <div>, so the markup is well-formed.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    # Column names here are the ones the tooltip (@gene_set, @x, @y, @cluster)
    # and the scatter call ('x', 'y', 'colors') refer to.
    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide axis labels and grid lines
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = 'Gene Sets in Wikipathways PFOCR Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    # The returned glyph renderer is not needed, so it is not bound to a name.
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
## defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
## spread, maxdf and mindf keep their defaults here (values tried before: 0.8, 0.5, 10)
umap_params = dict(nneighbors=15, mindist=0.1)
scatter_df = process_scatterplot(libdict, **umap_params)

# Display Scatter Plot
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# save HTML if desired
# scatterdir already ends with '/', so strip it before joining to avoid a
# doubled slash; this also works if the trailing slash is ever dropped.
output_file(filename=f"{scatterdir.rstrip('/')}/umap.html", title = 'Gene Sets in Wikipathways PFOCR 2023 Library')
save(plot)