# WikiPathways Pathways 2024 Harmonizome Processing
This notebook contains the processing scripts for the WikiPathways Pathways 2024 dataset for Harmonizome. The human GMT was downloaded from the [WikiPathways data repository](https://data.wikipathways.org/current/gmt) on 9/23/24. The NCBI Entrez gene IDs listed in the GMT were mapped to approved, up-to-date gene symbols; genes without a mapping were dropped.

In [None]:
import pandas as pd
import datetime
import numpy as np
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm
from collections import OrderedDict

from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
# Configure Bokeh to render plots inline in the notebook.
output_notebook()

from IPython.display import display, HTML, Markdown
# NOTE(review): the raised recursion limit is presumably needed by the clustered
# heatmaps drawn later in this notebook — confirm it is still required.
sys.setrecursionlimit(100000)

## Load and Preprocess Data

In [None]:
def load_gmt(file):
    """Parse a WikiPathways GMT file into a dict keyed by pathway name.

    Each non-blank line is expected to be tab-separated:
    ``term <TAB> url <TAB> gene1 <TAB> gene2 ...`` where ``term`` has the form
    ``pathway%version%wpid%species``.

    Parameters
    ----------
    file : iterable of str
        An open text file (or any iterable of lines).

    Returns
    -------
    dict
        ``{pathway: {'wpid': str, 'geneset': set of str}}``. If two lines share
        a pathway name, the later line replaces the earlier one.

    Raises
    ------
    ValueError
        If a non-blank line's term has fewer than four ``%``-separated parts.
    """
    gmt = {}
    for line in file:
        fields = line.strip().split('\t')
        # Blank lines (e.g. a trailing newline at end of file) carry no term/url
        # pair and would otherwise fail to unpack.
        if len(fields) < 2:
            continue
        term, _url, *geneset = fields
        # Split from the right so a '%' inside the pathway name itself does not
        # shift the version/wpid/species fields.
        pathway, _version, wpid, _species = term.strip().rsplit('%', 3)
        gmt[pathway] = {'wpid': wpid, 'geneset': set(geneset)}
    return gmt

In [None]:
# Parse the human WikiPathways GMT; the context manager closes the file handle
# as soon as parsing finishes (the original left it open).
with open('wikipathways-20240910-gmt-Homo_sapiens.gmt', 'r') as gmt_file:
    pathways = load_gmt(gmt_file)
# One row per (pathway, gene) pair: explode each pathway's gene set into rows.
pathways = pd.DataFrame(pathways).T.explode('geneset').reset_index()
pathways.columns = ['pathway','wpid', 'geneid']
# Gene identifiers in the GMT are parsed as strings; store them as integers.
pathways['geneid'] = pathways['geneid'].astype(int)
print(len(pathways['wpid'].unique()), 'pathways', len(pathways['geneid'].unique()), 'genes')
pathways

## Process Data

In [None]:
# Build a GeneID -> Symbol lookup from the rows whose '#tax_id' is 9606.
# NOTE(review): a later cell reads the same file via '../../../mapping/MappingFiles/...'
# (capital M); one of the two spellings will fail on a case-sensitive filesystem — confirm.
gene_info = pd.read_csv('../../../mapping/mappingFiles/GeneSymbolsAndIDS_2024.tsv', sep='\t')
human_gene_info = gene_info[gene_info['#tax_id']==9606]
geneids = human_gene_info.set_index('GeneID')['Symbol'].to_dict()

In [None]:
# Attach the symbol for each gene id, then discard rows with no mapping.
pathways = pathways.assign(gene=pathways['geneid'].map(geneids)).dropna()
n_pathways = len(pathways['wpid'].unique())
n_genes = len(pathways['geneid'].unique())
print(n_pathways, 'pathways', n_genes, 'genes')
pathways

## Prepare Data for SQL Ingestion

### Dataset

In [None]:
# Dataset table row for this release, written as a tuple literal so the cell output
# can be copied into an INSERT statement. The column order is listed on the next line.
# NOTE(review): 'num_page views' in the column list is presumably 'num_page_views' — confirm against the schema.
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, is_signed, is_continuous_valued, last_updated, directory, num_page views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural)
(158, 'Wikipathways Pathways 2024', 'Pathways 2024', 'sets of proteins participating in pathways from Wikipathways updated for 2024', 'protein-pathway associations from curated pathways', 'proteins participating in the {0} pathway from the Wikipathways Pathways 2024 dataset.', 'sets of proteins participating in pathways from the Wikipathways Pathways 2024 dataset.', 'pathways involving {0} protein from the Wikipathways Pathways 2024 dataset.', 0, 0, '2024-09-25', 'wikipathways24', 0, 75, 4, 6, 22, 4, 'association by literature curation', 'curated literature', 'low throughput, hypothesis driven', 'pathways')

### Publication

In [None]:
# Row linking dataset 158 (the dataset id used in the Dataset cell) to publication 156.
#(id, dataset_fk, publication_fk)
(240, 158, 156)

### Gene

In [None]:
# Load the gene table export (tables/gene.csv) and derive lookups from it.
dbgenes = pd.read_csv('../../../tables/gene.csv')#.drop_duplicates(subset='ncbi_entrez_gene_id')
dbgeneids = dbgenes['ncbi_entrez_gene_id'].tolist()
dbgenesymbols = dbgenes['symbol'].tolist()
# NOTE(review): the three dicts below are rebound by later cells before anything
# reads them (`geneids` and `genefks` in the next cell, `genes` in the add/update
# cell) — confirm whether these assignments are still needed.
genefks = dbgenes.set_index('symbol')['id'].to_dict()  # symbol -> gene table id
geneids = dbgenes.set_index('symbol')['ncbi_entrez_gene_id'].to_dict()  # symbol -> ncbi_entrez_gene_id
genes = dbgenes.set_index('ncbi_entrez_gene_id')['id'].to_dict()  # ncbi_entrez_gene_id -> gene table id
dbgenes

In [None]:
genesymbols = dbgenes['symbol'].to_list()
genelist = dbgenes['ncbi_entrez_gene_id'].to_list()

# Symbol -> GeneID lookup (used when writing the attribute set library).
# Two fixes relative to the original:
#   * the path now uses the same 'mappingFiles' spelling as the earlier cell that
#     reads this file, so both reads resolve identically on case-sensitive filesystems;
#   * rows are restricted to '#tax_id' 9606, matching the GeneID -> Symbol map built
#     earlier, so a symbol shared with another taxon cannot resolve to that taxon's GeneID.
symbol_table = pd.read_csv('../../../mapping/mappingFiles/GeneSymbolsAndIDS_2024.tsv', sep='\t')
symbol_table = symbol_table[symbol_table['#tax_id']==9606]
geneids = symbol_table.set_index('Symbol')['GeneID'].to_dict()

# GeneID -> description, used as the gene name in the add/update statements below.
genedescs = pd.read_csv('../../../mapping/source_files/human_gene_info', sep='\t').get(['GeneID', 'description']).set_index('GeneID')['description'].to_dict()
# ncbi_entrez_gene_id -> gene table id; extended below for newly added genes.
genefks = dbgenes.set_index('ncbi_entrez_gene_id')['id'].to_dict()

In [None]:
index = 58402
geneurl = 'https://ncbi.nlm.nih.gov/gene/'

pathwaygenes = pathways[['gene', 'geneid']].drop_duplicates()
genes = dbgenes.set_index('ncbi_entrez_gene_id')['symbol'].to_dict()

# Print one tuple per gene whose id is missing from the gene table, assigning
# consecutive ids starting at `index` and recording them in `genefks`.
# presumably the tuple is (id, symbol, ncbi_entrez_gene_id, name, url) — confirm against the schema.
print('add:')
for symbol, geneid in pathwaygenes.itertuples(index=False, name=None):
    symbol = symbol.upper()
    # Plain int: a NumPy integer scalar would print as `np.int64(...)` inside
    # the tuple under NumPy >= 2, which is not usable as a SQL value.
    geneid = int(geneid)
    if geneid not in genes:
        print((index, symbol, geneid, genedescs[geneid], geneurl+str(geneid)), end=',\n')
        genefks[geneid] = index
        index += 1


# Print an UPDATE statement for each gene already in the table whose stored
# symbol differs from the (upper-cased) symbol mapped above.
print('update:')
for symbol, geneid in pathwaygenes.itertuples(index=False, name=None):
    symbol = symbol.upper()
    geneid = int(geneid)
    if geneid in genes and genes[geneid]!=symbol:
        print(
f'''UPDATE gene SET
    symbol="{symbol}",
    name="{genedescs[geneid]}"
WHERE ncbi_entrez_gene_id={geneid}''', end=';\n\n'
        )

### Attribute

In [None]:
# `attributes` is first defined by the read_csv call in the following cell, so
# displaying it here raised NameError on a fresh kernel (Restart & Run All).
# The bare reference has been removed; inspect `attributes` after it is loaded.

In [None]:
# Column types for the attribute table export.
attribute_dtypes = {
    'id':int,
    'name_from_naming_authority':str,
    'id_from_naming_authority':str,
    'description_from_naming_authority':str,
    'url_from_naming_authority':str,
    'naming_authority_fk':int,
}
attributes = pd.read_csv('../../../tables/attribute.csv', dtype=attribute_dtypes)
#attributes = attributes[attributes['naming_authority_fk']==97]
# Names are lower-cased so the pathway lookup below is case-insensitive.
attributes['name_from_naming_authority'] = attributes['name_from_naming_authority'].astype(str).str.lower()
attributelist = attributes['name_from_naming_authority'].tolist()
attributefks = attributes.set_index('name_from_naming_authority')['id'].to_dict()

In [None]:
pathwayurl = 'https://www.wikipathways.org/pathways/'
pathwayids = pathways.set_index('pathway')['wpid'].to_dict()

# Printed tuple, going by the attribute table's column names, is presumably
#(id, name_from_naming_authority, id_from_naming_authority, url_from_naming_authority, naming_authority_fk)
index = 422385

# Hash the existing (lower-cased) attribute names once; testing membership
# against the list re-scanned the whole attribute table for every pathway.
known_attribute_names = set(attributelist)

# Print a row for each pathway whose name is not already an attribute, and
# record the id it was given so the gene-set rows can reference it.
for pathway in pathways['pathway'].unique():
    wpid = pathwayids[pathway]
    if pathway.lower() not in known_attribute_names:
        print((index, pathway, wpid, pathwayurl+wpid+'.html', 97), end=',\n')
        attributefks[pathway.lower()] = index
        index += 1

### Gene Set

In [None]:
#(id, name_from_dataset, id_from_dataset, description_from_dataset, url_from_dataset, dataset_fk, attribute_type, attribute_fk)
index = 135900000
genesetfks = {}

for pathway in pathways['pathway'].unique():
    wpid = pathwayids[pathway]
    print((index, pathway, wpid, pathwayurl+wpid+'.html', 158, 22, attributefks[pathway.lower()]), end=',\n')
    genesetfks[wpid] = index
    index += 1

### Association

In [None]:
index = 45000000

# Keep rows whose pathway was given a gene-set id, reduced to the two key columns.
in_gene_sets = pathways['wpid'].isin(list(genesetfks))
associations = (
    pathways.loc[in_gene_sets, ['geneid', 'wpid']]
    .rename(columns={'geneid': 'gene_fk', 'wpid': 'gene_set_fk'})
)

# Swap source identifiers for database foreign keys and add the constant threshold.
associations = associations.assign(
    gene_fk=associations['gene_fk'].apply(lambda x: genefks[int(x)]),
    gene_set_fk=associations['gene_set_fk'].apply(lambda x: genesetfks[x]),
    threshold_value=1,
)

# De-duplicate, then number the rows consecutively starting at `index`.
associations = associations.drop_duplicates().reset_index(drop=True)
associations.index = associations.index + index
associations = associations.rename_axis('id')
associations

In [None]:
# Write the association rows, including the 'id' index column, to CSV.
associations.to_csv('../../../harmonizome-update/wikipathways24.csv')

## Create Downloads

In [None]:
# Prefix for every download written below; the later cells also write into
# 'kg_serializations/' beneath it and do not create any directories themselves.
output_path = 'downloads/'

### Gene-Attribute Binary Matrix

In [None]:
# crosstab counts occurrences, so a gene symbol listed twice for one pathway
# (e.g. two gene ids mapping to the same symbol) would yield 2 rather than 1.
# Clip to 1 so the matrix is genuinely binary, as the file name and the
# downstream `== 1` membership tests assume.
binarymatrix = pd.crosstab(pathways['gene'], pathways['pathway']).clip(upper=1)
binarymatrixT = binarymatrix.T
binarymatrix.to_csv(output_path+'gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
binarymatrix

### Gene-Attribute Edge List

In [None]:
# Gene-pathway pairs with both the symbol/name and the source identifiers.
edge_columns = ['gene', 'geneid', 'pathway', 'wpid']
edgelist = pathways[edge_columns]
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Distinct (symbol, gene id) pairs, renumbered from zero.
geneslist = edgelist[['gene', 'geneid']].drop_duplicates(ignore_index=True)
geneslist.to_csv(output_path+'gene_list_terms.txt.gz', sep='\t', compression='gzip')
geneslist

### Attribute List

In [None]:
# Distinct (pathway name, WikiPathways id) pairs, renumbered from zero.
attributeslist = edgelist[['pathway', 'wpid']].drop_duplicates(ignore_index=True)
attributeslist.to_csv(output_path+'attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributeslist

### Gene Set Library

In [None]:
# Write one GMT line per pathway with at least 5 member genes:
# pathway name, WikiPathways id, then the member gene symbols (tab-separated).
with open(output_path+'gene_set_library_crisp.gmt', 'w') as f:
    # Any positive entry counts as membership, so a count above 1 in the
    # crosstab cannot silently drop a gene (the original tested `== 1`).
    membership = binarymatrix.to_numpy(dtype=np.int_) > 0
    gene_symbols = binarymatrix.index
    # Local name: the original rebound the notebook-level `attributes` DataFrame here.
    pathway_names = binarymatrix.columns

    n_genes, n_pathways = membership.shape
    for i in tqdm(range(n_pathways)):
        members = gene_symbols[membership[:, i]]
        if len(members) >= 5:
            print(pathway_names[i], pathwayids[pathway_names[i]], *members, sep='\t', end='\n', file=f)

### Attribute Set Library

In [None]:
# Write one GMT line per gene that belongs to at least 5 pathways:
# gene symbol, the id looked up in `geneids`, then the pathway names (tab-separated).
with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    # Any positive entry counts as membership, so a count above 1 in the
    # crosstab cannot silently drop a pathway (the original tested `== 1`).
    membership = binarymatrixT.to_numpy(dtype=np.int_) > 0
    pathway_names = binarymatrixT.index
    # Local name: the original rebound the notebook-level `genes` dict here.
    gene_symbols = binarymatrixT.columns

    n_pathways, n_genes = membership.shape
    for i in tqdm(range(n_genes)):
        member_pathways = pathway_names[membership[:, i]]
        if len(member_pathways) >= 5:
            print(gene_symbols[i], geneids[gene_symbols[i]], *member_pathways, sep='\t', end='\n', file=f)

### Gene Similarity Matrix

In [None]:
# Cosine similarity between every pair of gene rows: pdist returns condensed
# cosine distances, squareform expands them, and 1 - distance gives similarity.
gene_vectors = binarymatrix.to_numpy(dtype=np.int_)
gene_similarity_matrix = pd.DataFrame(
    data=1 - dist.squareform(dist.pdist(gene_vectors, 'cosine')),
    index=binarymatrix.index,
    columns=binarymatrix.index,
)

# NOTE(review): the index object passed in is binarymatrix.index itself, so clearing
# the name here may also clear it on binarymatrix depending on pandas version — confirm.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
def load_gmt(file):
    """Parse tab-separated GMT lines into ``{term: {'geneset': set_of_remaining_fields}}``.

    The first field of each line is the term; every following field goes into
    the set. A later line with the same term replaces the earlier entry.
    """
    parsed = {}
    for raw in file:
        fields = raw.strip().split('\t')
        parsed[fields[0]] = {'geneset': set(fields[1:])}
    return parsed

# NOTE(review): this reads a gene set library from a 'WikiPathwaysPFOCR' directory
# rather than from `output_path`, and `gmt`, `gmtmatrix` and `gmtmatrixT` are not
# read by any later cell in this notebook (the similarity below uses
# `binarymatrixT`) — confirm whether this cell is still needed.
# The file is opened in a context manager so the handle is closed after parsing.
with open('newdata/WikiPathwaysPFOCR/downloads/gene_set_library_crisp.gmt', 'r') as gmt_file:
    gmt = pd.DataFrame(load_gmt(gmt_file)).T.explode('geneset').reset_index()
gmt.columns = ['pathway', 'gene']

gmtmatrix = pd.crosstab(gmt['gene'], gmt['pathway'])
gmtmatrixT = gmtmatrix.T
gmtmatrix

In [None]:
# Cosine similarity between every pair of pathway rows of the transposed matrix:
# condensed cosine distances -> square matrix -> 1 - distance.
pathway_vectors = binarymatrixT.to_numpy(dtype=np.int_)
attribute_similarity_matrix = pd.DataFrame(
    data=1 - dist.squareform(dist.pdist(pathway_vectors, 'cosine')),
    index=binarymatrixT.index,
    columns=binarymatrixT.index,
)

# NOTE(review): the index object passed in is binarymatrixT.index itself, so clearing
# the name here may also clear it on binarymatrixT depending on pandas version — confirm.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Knowledge Graph Serialization

In [None]:
nodes = {}
edges = []

# Gene nodes, keyed by integer gene id.
for gene_symbol, gene_id in geneslist[['gene', 'geneid']].itertuples(index=False, name=None):
    gene_id = int(gene_id)
    nodes[gene_id] = {
        "type":"gene",
        "properties": {"id":gene_id, "label":gene_symbol},
    }

# Pathway nodes, keyed by WikiPathways id.
for pathway_name, pathway_id in attributeslist[['pathway', 'wpid']].itertuples(index=False, name=None):
    nodes[pathway_id] = {
        "type":"pathway",
        "properties": {"id":pathway_id, "label":pathway_name},
    }

# One directed gene -> pathway edge per row of the edge list.
edge_rows = edgelist[['gene', 'geneid', 'pathway', 'wpid']].itertuples(index=False, name=None)
for gene_symbol, gene_id, pathway_name, pathway_id in edge_rows:
    edge_properties = {
        "id":str(gene_id)+":"+pathway_id,
        "source_id":int(gene_id),
        "source_label":gene_symbol,
        "target_label":pathway_name,
        "target_id":pathway_id,
        "directed":True,
        "threshold":1
    }
    edges.append({
        "source": int(gene_id),
        "relation": "participates in",
        "target": pathway_id,
        "properties": edge_properties,
    })

#### RDF

In [None]:
# Write prefix declarations, a blank line, then one "subject predicate object ." line per edge.
# NOTE(review): the prefix lines have no <...> IRI delimiters or trailing '.', so the
# file is presumably not parseable as strict Turtle — confirm what consumers expect.
with open(output_path+'kg_serializations/wikipathways24.rdf', 'w') as f:
    header_lines = (
        '@prefix gene: ncbi.nlm.nih.gov/gene/',
        '@prefix RO: purl.obolibrary.org/RO_',
        '@prefix WP: https://www.wikipathways.org/pathways/WP',
        '',
    )
    for header_line in header_lines:
        print(header_line, file=f)

    for edge in edges:
        props = edge['properties']
        subject = 'gene:'+str(props['source_id'])
        # 'WP123' becomes 'WP:123' so the id uses the WP prefix declared above.
        target = props['target_id'].replace('WP','WP:')
        print(subject, 'RO:0000056', target, end=' .\n', file=f)

#### JSON

In [None]:
# Serialize the node and edge collections to a single indented JSON document.
# json.dump writes to the file and returns None, so its result is not bound
# to a name (the original stored that None in `serial`).
with open(output_path+'kg_serializations/wikipathways24.json', 'w') as f:
    json.dump(
        {
            "Version":"1", 
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

#### TSV

In [None]:
# Flatten the node dict into a (namespace, id, label) table and write it as TSV.
namespace_by_type = {'gene':'NCBI Entrez', 'pathway':'Wikipathways'}

nodeframe = pd.DataFrame(nodes).T
nodeframe = nodeframe.assign(
    id=nodeframe['properties'].map(lambda props: props['id']),
    label=nodeframe['properties'].map(lambda props: props['label']),
    namespace=nodeframe['type'].map(lambda node_type: namespace_by_type[node_type]),
)
nodeframe = nodeframe[['namespace', 'id', 'label']].reset_index(drop=True)
nodeframe.to_csv(output_path+'kg_serializations/wikipathways24_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Flatten the edge list into a (source, relation, target, threshold) table and write it as TSV.
edgeframe = pd.DataFrame(edges)
edgeframe = edgeframe.assign(
    threshold=edgeframe['properties'].map(lambda props: props['threshold'])
)
edgeframe = edgeframe[['source', 'relation', 'target', 'threshold']]
edgeframe.to_csv(output_path+'kg_serializations/wikipathways24_tsv/edges.tsv', sep='\t')
edgeframe

## Create Visualizations

### Gene-Attribute Clustered Heatmap

In [None]:
# Clustered heatmap of the gene-by-pathway matrix.
sns.clustermap(binarymatrix, cmap='seismic', center=0, figsize=(12,12))

### Gene-Gene Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the gene-gene cosine similarity matrix.
sns.clustermap(gene_similarity_matrix, cmap='seismic', center=0)

### Attribute-Attribute Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the pathway-pathway cosine similarity matrix.
sns.clustermap(attribute_similarity_matrix, cmap='seismic', center=0, figsize=(12,12))

### UMAP

In [None]:
def load_gmt(file):
    """Parse a crisp gene set library into ``{term: "GENE1 GENE2 ..."}``.

    Each line is tab-separated: term, id, then the member genes. The id field
    is discarded. Members are de-duplicated and joined with single spaces in
    sorted order, so the resulting strings are identical from run to run
    (joining a bare ``set`` made the order depend on string hashing).

    Parameters
    ----------
    file : iterable of str
        An open text file (or any iterable of lines).

    Returns
    -------
    collections.OrderedDict
        Terms in file order mapped to their space-joined gene string.
    """
    gmt = OrderedDict()
    for line in file:
        term, _wpid, *geneset = line.strip().split('\t')
        gmt[term] = ' '.join(sorted(set(geneset)))
    return gmt

In [None]:
# Re-read the gene set library written above; the context manager closes the
# file handle once parsing is done (the original left it open).
with open(output_path+'gene_set_library_crisp.gmt', 'r') as library_file:
    libdict = load_gmt(library_file)
scatterdir = 'images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed gene sets in 2-D with UMAP and label them with Leiden clusters.

    Parameters
    ----------
    libdict : mapping of str -> str
        Term name mapped to a space-separated string of its genes.
    nneighbors : int
        ``n_neighbors`` passed to ``sc.pp.neighbors``.
    mindist, spread : float
        ``min_dist`` and ``spread`` passed to ``sc.tl.umap``.
    maxdf, mindf : float or int
        ``max_df`` and ``min_df`` passed to ``TfidfVectorizer``.

    Returns
    -------
    pandas.DataFrame
        One row per term with columns ``x``, ``y`` (UMAP coordinates),
        ``cluster`` ("Cluster <n>"), ``term`` and ``genes``; rows are ordered
        by Leiden cluster.
    """
    print("\tTF-IDF vectorizing gene set data...")
    vec = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    X = vec.fit_transform(libdict.values())
    print(X.shape)
    adata = anndata.AnnData(X)
    adata.obs.index = list(libdict.keys())

    print("\tPerforming Leiden clustering...")
    ### the n_neighbors and min_dist parameters can be altered
    sc.pp.neighbors(adata, n_neighbors=nneighbors)
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    new_order = adata.obs.sort_values(by='leiden').index.tolist()
    # Subsetting returns a view; take a real copy before writing to `.obs` so the
    # assignment below does not have to implicitly materialise the view.
    adata = adata[new_order, :].copy()
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    df = pd.DataFrame(adata.obsm['X_umap'])
    df.columns = ['x', 'y']

    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    df['genes'] = [libdict[l] for l in df['term']]

    return df

In [None]:
def get_scatter_colors(df):
    """Map each distinct value of ``df['cluster']`` to a Category20 colour.

    Colours are taken from the even-indexed palette entries first, then the
    odd-indexed ones, and repeat after 20 clusters. Clusters are numbered in
    order of first appearance in ``df``.
    """
    palette = list(Category20[20])
    ordered_colors = palette[::2] + palette[1::2]
    cluster_names = pd.unique(df['cluster']).tolist()
    return {
        cluster_name: ordered_colors[position % 20]
        for position, cluster_name in enumerate(cluster_names)
    }

def get_scatterplot(scatterdf):
    """Build the Bokeh UMAP scatter plot for the gene set library.

    Parameters
    ----------
    scatterdf : pandas.DataFrame
        Frame with columns ``x``, ``y``, ``term`` and ``cluster`` (as returned
        by ``process_scatterplot``). It is copied, not modified.

    Returns
    -------
    bokeh figure
        A 1000x700 figure with one point per gene set, coloured by cluster,
        and a hover tooltip showing the gene set name, coordinates and cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Tooltip markup: every <div> is now closed (the original left two of the
    # inner rows open) and the outer margin carries an explicit unit.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10px">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    # Column names here are the @fields referenced by the tooltip and the
    # 'colors' field referenced by the scatter glyph below.
    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide axis labels and grid lines
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = 'Gene Sets in Wikipathways Pathways 2024 Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    # The returned glyph renderer is not needed, so it is not bound to a name.
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
## defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
# Compute the embedding with the settings chosen for this library.
scatter_df = process_scatterplot(libdict, nneighbors=25, mindist=0.1, spread=1.3, maxdf=0.9, mindf=2)

# Build the figure and render it inline.
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# save HTML if desired
# NOTE(review): `scatterdir` is 'images/' (trailing slash), so this path contains a
# double slash ('images//umap.html'); the cell does not create the directory.
output_file(filename=f"{scatterdir}/umap.html", title = 'Gene Sets in Wikipathways Pathways 2024 Library')
save(plot)