# Reactome Pathways 2024
This notebook contains the script used to process the Reactome Pathways 2024 library for Harmonizome. The Reactome Pathways GMT was downloaded from the Reactome [download page](https://reactome.org/download-data) on December 2, 2024.

The GMT was cleaned to ensure all gene symbols matched up-to-date and approved gene symbols from NCBI Gene.

After this process, the dataset contained 125,317 associations between 2,709 pathways and 11,102 proteins.

In [None]:
import pandas as pd
import datetime
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm

# UMAP
from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

## Load and Pre-Process Data

In [None]:
def load_gmt(file):
    """Parse a tab-separated GMT file into gene sets and pathway identifiers.

    Every non-blank line must hold at least two tab-separated fields: the
    pathway name, the pathway identifier, and then zero or more gene symbols.

    Parameters
    ----------
    file : str or path-like
        Path of the GMT file to read.

    Returns
    -------
    gmt : dict
        Maps each pathway name to the set of gene symbols on its line.
    ids : dict
        Maps each pathway name to the identifier from the second field.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than two fields.

    Notes
    -----
    If the same pathway name appears on several lines, the last line wins.
    """
    gmt = {}
    ids = {}
    with open(file, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at end of file) carry no
                # record; unpacking them would raise ValueError.
                continue
            pathway, pathwayid, *geneset = stripped.split('\t')
            # Consecutive tabs yield empty tokens, which are not gene symbols.
            gmt[pathway] = {gene for gene in geneset if gene}
            ids[pathway] = pathwayid
    return gmt, ids

In [None]:
# Parse the downloaded Reactome GMT: `pathways` maps pathway name -> set of gene
# symbols, `pathwayids` maps pathway name -> the identifier in the second field.
# NOTE: the UMAP section later redefines `load_gmt` with a different signature,
# so re-running this cell after that one requires re-running the definition above.
pathways, pathwayids = load_gmt('ReactomePathways.gmt')

In [None]:
# Long format: one row per (pathway, gene) pair. Exploding the
# name -> gene-set mapping repeats the pathway name for every member gene.
reactome = pd.Series(pathways).explode().astype(str).reset_index()
reactome.columns = ['Pathway', 'Gene']
reactome

In [None]:
# Number of distinct pathways and distinct gene strings before symbol mapping.
# NOTE(review): a pathway with no genes presumably contributes the literal
# string 'nan' here (explode + astype(str) above) - confirm if the count matters.
reactome['Pathway'].nunique(), reactome['Gene'].nunique()

## Map Genes to Approved and Up-to-Date Gene Symbols

In [None]:
# Load the gene_info table and keep only rows for taxon 9606 that are
# annotated as protein-coding.
geneinfo = pd.read_csv('../../mapping/source_files/human_gene_info', sep='\t')

# Combine both conditions into a single mask. Chaining `df[mask1][mask2]`
# applies a mask built on the unfiltered frame to the already-filtered one,
# which makes pandas warn that the boolean key is being reindexed. The
# explicit copy keeps the column assignment below from raising
# SettingWithCopyWarning.
is_human = geneinfo['#tax_id'] == 9606
is_protein_coding = geneinfo['type_of_gene'] == 'protein-coding'
geneinfo = geneinfo[is_human & is_protein_coding].copy()

# Synonyms are pipe-delimited; split them and give each synonym its own row.
# `.str.split` also tolerates missing values, unlike `apply(str.split)`.
geneinfo['Synonyms'] = geneinfo['Synonyms'].str.split('|')
geneinfo = geneinfo.explode('Synonyms')[['GeneID', 'Symbol', 'Synonyms', 'description']]
geneinfo

In [None]:
# Build the lookup used to normalise gene names: every synonym points at its
# symbol, and every symbol points at itself. Identity entries are written
# last, so a symbol that is also a synonym of another gene keeps its own name.
symbols = set(geneinfo['Symbol'])
genedict = geneinfo.set_index('Synonyms')['Symbol'].to_dict()
# '-' appears in the Synonyms column but is not a usable synonym
# (presumably the placeholder for "no synonyms" - verify against the source file).
del genedict['-']

genedict.update((symbol, symbol) for symbol in symbols)

In [None]:
# Replace each gene string with its mapped symbol; strings missing from the
# lookup become NaN and are dropped, then repeated rows are removed.
reactome = (
    reactome
    .assign(Gene=reactome['Gene'].map(genedict))
    .dropna()
    .drop_duplicates()
)
n_genes = reactome['Gene'].nunique()
n_pathways = reactome['Pathway'].nunique()
print(n_genes, 'genes,', n_pathways, 'pathways')
reactome

## Prepare Data for SQL Ingestion

### Dataset

In [None]:
# Row for the dataset table; the column order is given in the header comment below.
# NOTE(review): the header lists 24 columns but the tuple holds 22 values, so two
# columns (somewhere between positive_association and is_continuous_valued, where
# only two 0s are supplied) have no value here - confirm which ones are omitted.
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, positive_association, negative_association, is_signed, is_continuous_valued, last_updated, directory, num_page_views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural)
(163, 'Reactome Pathways 2024', 'Pathways 2024', 'Sets of proteins participating in pathways from Reactome, updated for 2024', 'protein-pathway associations from curated pathways', 'proteins participating in the {0} pathway from the Reactome Pathways 2024 dataset.', 'sets of proteins participating in pathways from the Reactome Pathways 2024 dataset.', 'pathways involving {0} protein from the Reactome Pathways 2024 dataset.', 0, 0, '2024-12-2', 'reactome24', 0, 65, 4, 6, 22, 4, 'association by literature curation', 'curated literature', 'low throughput, hypothesis driven', 'pathways')

### Publication

In [None]:
# Row for the publication table; the column order is given in the header comment below.
#(id, long_citation, short_citation, url, pmid, pubmed_url, first_author_last_name, first_author_initials, journal_abbreviation, year, title, volume, pages)
(159, 'Milacic M et al. (2024) The Reactome Pathway Knowledgebase 2024. Nucleic Acids Res. 52:D672-8', 'Milacic, Nucleic Acids Res, 2024', 'dx.doi.org/10.1093/nar/gkad1025', 37941124, 'https://ncbi.nlm.nih.gov/pubmed/37941124', 'Milacic', 'M', 'Nucleic Acids Res', 2024, 'The Reactome Pathway Knowledgebase 2024', 52, 'D672-8')

# NOTE(review): unlabeled tuple. 163 matches the dataset id and 159 the
# publication id used in this notebook, so this is presumably the
# dataset-to-publication link row (id, dataset_fk, publication_fk) - confirm.
(246, 163, 159)

### Attributes

In [None]:
# Existing attribute rows, keyed by lower-cased name, so pathways already in the
# attribute table reuse their primary key instead of getting a new one.
attributes = pd.read_csv('../../tables/attribute.txt', sep='\t')
attributes['name_from_naming_authority'] = attributes['name_from_naming_authority'].astype(str).str.lower()
attributeset = set(attributes['name_from_naming_authority'])
attributefks = attributes.set_index('name_from_naming_authority')['id'].to_dict()
pathwayurl = 'https://reactome.org/PathwayBrowser/#/'

# First id handed out to attributes that are not in the table yet.
index = 437616

# Print one SQL-style tuple per new attribute and remember the id it received.
for pathway in reactome['Pathway'].unique():
    attribute_key = pathway.lower()
    if attribute_key in attributeset:
        continue
    pathwayid = pathwayids[pathway]
    print((index, pathway, pathwayid, pathwayurl+pathwayid, 87), end=',\n')
    attributefks[attribute_key] = index
    index += 1

### Gene Sets

In [None]:
# Hand out consecutive gene-set ids, one per distinct pathway, in order of
# first appearance, then print one SQL-style tuple per gene set.
first_geneset_id = 136400000
unique_pathways = reactome['Pathway'].unique()
genesetfks = {
    pathway: first_geneset_id + offset
    for offset, pathway in enumerate(unique_pathways)
}

for pathway, geneset_id in genesetfks.items():
    pathwayid = pathwayids[pathway]
    print((geneset_id, pathway, pathwayid, pathwayurl+pathwayid, 163, 22, attributefks[pathway.lower()]), end=',\n')

# Leave `index` one past the last id used, as a running counter would.
index = first_geneset_id + len(genesetfks)

### Genes

In [None]:
# One row per symbol, used to fetch the GeneID and description of new genes.
lookup = geneinfo.drop_duplicates('Symbol').set_index('Symbol')
genes = pd.read_csv('../../tables/gene.csv')
genefks = genes.set_index('symbol')['id'].to_dict()
geneslist = set(genes['symbol'])
geneurl = 'https://ncbi.nlm.nih.gov/gene/'
# First id handed out to genes that are not in the gene table yet.
index = 58466

# Print one SQL-style tuple per gene missing from the gene table.
# NOTE(review): membership is tested on the upper-cased symbol, which assumes
# gene.csv stores symbols in upper case - confirm, otherwise mixed-case symbols
# already in the table would be inserted again.
for gene in reactome['Gene'].unique():
    if gene.upper() in geneslist:
        continue
    record = lookup.loc[gene]
    # Cast to a plain int: a NumPy integer inside a printed tuple renders as
    # `np.int64(...)` under NumPy >= 2, which would corrupt the emitted rows.
    geneid = int(record['GeneID'])
    print((index, gene, geneid, geneurl+str(geneid), record['description']), end=',\n')
    genefks[gene.upper()] = index
    index += 1

### Association

In [None]:
# Translate every (gene, pathway) pair into foreign keys, drop repeats, and
# write the association rows with ids starting at 50,000,000.
associations = (
    pd.DataFrame({
        'gene_fk': reactome['Gene'].str.upper().map(genefks),
        'gene_set_fk': reactome['Pathway'].map(genesetfks),
    })
    .drop_duplicates()
    .reset_index(drop=True)
    .assign(threshold_value=1)
)
associations.index += 50000000
associations.to_csv('../../harmonizome-update/reactome24.csv')

associations

## Create Downloads

In [None]:
# Root folder for every download artifact written below. Create it and the
# nested knowledge-graph folders up front: `to_csv` and `open(..., 'w')` do not
# create missing parent directories, so a fresh checkout would otherwise fail
# at the first write.
output_path = 'downloads/'
os.makedirs(output_path + 'kg_serializations/reactome24_tsv', exist_ok=True)

In [None]:
# Assemble the download table: symbol, GeneID, pathway name, pathway id and a
# constant threshold of 1 for every association.
geneiddict = lookup['GeneID'].to_dict()
reactome = reactome.drop_duplicates()
reactome = pd.DataFrame({
    'Gene': reactome['Gene'],
    # Direct indexing on purpose: an unknown symbol raises KeyError.
    'Gene ID': [geneiddict[symbol] for symbol in reactome['Gene']],
    'Pathway': reactome['Pathway'],
    'Pathway ID': reactome['Pathway'].map(pathwayids),
    'Threshold': 1,
}).reset_index(drop=True)
reactome

### Gene Attribute Binary Matrix

In [None]:
# Gene x pathway membership matrix: 1 where the pair is associated, else 0.
# `aggfunc` is given as the string 'max' because passing the builtin `max` is
# deprecated in recent pandas (the string keeps the current behaviour).
binarymatrix = (
    pd.crosstab(reactome['Gene'], reactome['Pathway'], values=reactome['Threshold'], aggfunc='max')
    .fillna(0)   # pairs with no association come out of crosstab as NaN
    .astype(int)
)
# Transposed view (pathways as rows), reused by the attribute-level outputs.
binarymatrixT = binarymatrix.T
binarymatrix.to_csv(output_path+'gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
binarymatrix

### Gene Attribute Edge List

In [None]:
# The edge list is the download table as-is; copy it so later edits to either
# frame stay independent.
edge_list_file = output_path+'gene_attribute_edges.txt.gz'
edgelist = reactome.copy()
edgelist.to_csv(edge_list_file, sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Distinct (symbol, GeneID) pairs that occur in the edge list.
gene_list_file = output_path+'gene_list_terms.txt.gz'
geneslist = edgelist[['Gene', 'Gene ID']].drop_duplicates().reset_index(drop=True)
geneslist.to_csv(gene_list_file, sep='\t', compression='gzip')
geneslist

### Attribute List

In [None]:
# Distinct (pathway name, pathway id) pairs that occur in the edge list.
attribute_list_file = output_path+'attribute_list_entries.txt.gz'
attributeslist = edgelist[['Pathway', 'Pathway ID']].drop_duplicates().reset_index(drop=True)
attributeslist.to_csv(attribute_list_file, sep='\t', compression='gzip')
attributeslist

### Gene Set Library

In [None]:
# Write one line per pathway: the pathway name followed by its member genes,
# tab-separated (no description column). Pathways with fewer than 5 genes are
# left out. Local names are chosen so this cell no longer overwrites the
# `attributes` table loaded earlier.
with open(output_path+'gene_set_library_crisp.gmt', 'w') as f:
    membership = binarymatrix.to_numpy(dtype=np.int_)
    min_set_size = 5

    # Columns of the gene x pathway matrix are pathways; iterate them as rows
    # of the transposed array so each mask is computed once.
    pathway_columns = zip(binarymatrix.columns, membership.T)
    for pathway_name, column in tqdm(pathway_columns, total=membership.shape[1]):
        members = binarymatrix.index[column == 1]
        if len(members) >= min_set_size:
            print(pathway_name, *members, sep='\t', end='\n', file=f)

### Attribute Set Library

In [None]:
# Write one line per gene: the gene symbol followed by the pathways it belongs
# to, tab-separated (no description column). Genes in fewer than 5 pathways are
# left out. Local names are chosen so this cell no longer overwrites the
# `genes` table loaded earlier.
with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    membership = binarymatrixT.to_numpy(dtype=np.int_)
    min_set_size = 5

    # Columns of the pathway x gene matrix are genes; iterate them as rows of
    # the transposed array so each mask is computed once.
    gene_columns = zip(binarymatrixT.columns, membership.T)
    for gene_symbol, column in tqdm(gene_columns, total=membership.shape[1]):
        member_pathways = binarymatrixT.index[column == 1]
        if len(member_pathways) >= min_set_size:
            print(gene_symbol, *member_pathways, sep='\t', end='\n', file=f)

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between gene rows of the membership matrix:
# pdist yields condensed cosine distances, squareform expands them to a square
# matrix, and 1 - distance turns them into similarities.
gene_vectors = binarymatrix.to_numpy(dtype=np.int_)
gene_distances = dist.squareform(dist.pdist(gene_vectors, 'cosine'))

gene_similarity_matrix = pd.DataFrame(
    data=1 - gene_distances,
    index=binarymatrix.index,
    columns=binarymatrix.index,
)
# Clear the axis names so the written header has no axis label.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between pathway rows of the transposed membership
# matrix: condensed cosine distances -> square matrix -> 1 - distance.
pathway_vectors = binarymatrixT.to_numpy(dtype=np.int_)
pathway_distances = dist.squareform(dist.pdist(pathway_vectors, 'cosine'))

attribute_similarity_matrix = pd.DataFrame(
    data=1 - pathway_distances,
    index=binarymatrixT.index,
    columns=binarymatrixT.index,
)
# Clear the axis names so the written header has no axis label.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Knowledge Graph Serialization

In [None]:
# In-memory knowledge graph: `nodes` is keyed by GeneID (int) for genes and by
# pathway id (str) for pathways; `edges` holds one "participates in" record per
# gene-pathway association.
nodes = {}
edges = []

# Gene nodes.
for gene_id, gene_symbol in zip(geneslist['Gene ID'], geneslist['Gene']):
    gene_id = int(gene_id)
    nodes[gene_id] = {
        "type":"gene",
        "properties": {
            "id":gene_id,
            "label":gene_symbol
        }}

# Pathway nodes.
for pathway_id, pathway_name in zip(attributeslist['Pathway ID'], attributeslist['Pathway']):
    nodes[pathway_id] = {
        "type":"pathway",
        "properties": {
            "id":pathway_id,
            "label":pathway_name
        }}

# Directed gene -> pathway edges, walked column-wise instead of row by row.
edge_columns = zip(
    edgelist['Gene ID'],
    edgelist['Gene'],
    edgelist['Pathway ID'],
    edgelist['Pathway'],
)
for gene_id, gene_symbol, pathway_id, pathway_name in edge_columns:
    edges.append({
        "source": int(gene_id),
        "relation": "participates in",
        "target": pathway_id,
        "properties":{
            "id":str(gene_id)+":"+pathway_id,
            "source_id":int(gene_id),
            "source_label":gene_symbol,
            "target_label":pathway_name,
            "target_id":pathway_id,
            "directed":True,
            "threshold":1
        }})

#### RDF

In [None]:
# Serialize the edges as prefixed triples: gene:<GeneID> RO:0000056 Reactome:<id> .
# The prefix declarations follow Turtle syntax (`@prefix name: <IRI> .`); without
# the angle brackets and the terminating dot the file cannot be parsed as Turtle.
# RO terms live under the OBO PURL base http://purl.obolibrary.org/obo/.
with open(output_path+'kg_serializations/reactome24.rdf', 'w') as f:
    print('@prefix gene: <https://ncbi.nlm.nih.gov/gene/> .', file=f)
    print('@prefix RO: <http://purl.obolibrary.org/obo/RO_> .', file=f)
    print('@prefix Reactome: <https://reactome.org/PathwayBrowser/#/> .', file=f)

    print('', file=f)
    for edge in edges:
        print('gene:'+str(edge['properties']['source_id']), 'RO:0000056', 'Reactome:'+edge['properties']['target_id'], end=' .\n', file=f)

#### JSON

In [None]:
# Dump the node and edge collections as one JSON document. `json.dump` writes to
# the file and returns None, so its result is not kept. Integer node keys
# (GeneIDs) are written as JSON strings, since JSON object keys must be strings.
with open(output_path+'kg_serializations/reactome24.json', 'w') as f:
    json.dump(
        {
            "Version":"1",
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

#### TSV

In [None]:
# Flatten the node dictionary into a (namespace, id, label) table.
node_namespaces = {'gene':'NCBI Entrez', 'pathway':'Reactome'}

nodeframe = pd.DataFrame(nodes).T
node_properties = nodeframe['properties']
nodeframe['id'] = [properties['id'] for properties in node_properties]
nodeframe['label'] = [properties['label'] for properties in node_properties]
# Direct indexing on purpose: an unexpected node type raises KeyError.
nodeframe['namespace'] = [node_namespaces[node_type] for node_type in nodeframe['type']]
nodeframe = nodeframe[['namespace', 'id', 'label']].reset_index(drop=True)
nodeframe.to_csv(output_path+'kg_serializations/reactome24_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Flatten the edge records into a (source, relation, target, threshold) table;
# the threshold is pulled out of each edge's nested properties.
edgeframe = pd.DataFrame(edges)
edgeframe['threshold'] = [properties['threshold'] for properties in edgeframe['properties']]
edgeframe = edgeframe[['source', 'relation', 'target', 'threshold']]
edgeframe.to_csv(output_path+'kg_serializations/reactome24_tsv/edges.tsv', sep='\t')
edgeframe

## Create Visualizations

In [None]:
# Clustered heatmap of the binary gene x pathway membership matrix;
# tick labels are suppressed on both axes.
sns.clustermap(binarymatrix, cmap='seismic', center=0, xticklabels=False, yticklabels=False)

### Gene Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the gene-gene cosine similarity matrix.
# NOTE(review): unlike the other two clustermaps in this notebook, tick labels
# are not suppressed here - confirm whether that is intentional.
sns.clustermap(gene_similarity_matrix, cmap='seismic', center=0)

### Attribute Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the pathway-pathway cosine similarity matrix;
# tick labels are suppressed on both axes.
sns.clustermap(attribute_similarity_matrix, cmap='seismic', center=0, xticklabels=False, yticklabels=False)

### UMAP

In [None]:
def load_gmt(file):
    """Read a crisp gene set library into an ordered term -> gene-string mapping.

    The library written by this notebook has no description column: each line
    is the term followed directly by its genes, tab-separated. Treating the
    second field as a blank description therefore dropped the first gene of
    every set. Here every field after the term is a gene, and empty fields
    (such as a genuinely blank description column) are ignored.

    NOTE: this shares its name with the path-based ``load_gmt`` defined near
    the top of the notebook and replaces it once this cell has run.

    Parameters
    ----------
    file : iterable of str
        Lines of the GMT file, e.g. an open text file handle.

    Returns
    -------
    collections.OrderedDict
        Maps each term, in file order, to its distinct genes joined by single
        spaces (the order of genes within the string is not defined).
    """
    gmt = OrderedDict()
    for line in file:
        term, *geneset = line.strip().split('\t')
        if not term:
            # Blank line: nothing to record.
            continue
        gmt[term] = ' '.join({gene for gene in geneset if gene})
    return gmt
# Read the gene set library written earlier; the context manager closes the
# file handle once it has been parsed.
with open('downloads/gene_set_library_crisp.gmt', 'r') as gmt_file:
    libdict = load_gmt(gmt_file)
# Folder for the saved scatter plot; create it so the save step cannot fail on
# a missing directory.
scatterdir = 'images/'
os.makedirs(scatterdir, exist_ok=True)

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed gene sets in 2-D with UMAP and label them with Leiden clusters.

    Each gene set's gene string is TF-IDF vectorized, a neighbor graph is built
    on the resulting matrix, Leiden clustering and UMAP are run on it, and the
    results are returned sorted by cluster.

    Parameters
    ----------
    libdict : mapping
        Term -> space-separated gene string.
    nneighbors : int
        ``n_neighbors`` passed to ``sc.pp.neighbors``.
    mindist, spread : float
        ``min_dist`` and ``spread`` passed to ``sc.tl.umap``.
    maxdf, mindf : float or int
        ``max_df`` and ``min_df`` passed to ``TfidfVectorizer``.

    Returns
    -------
    pandas.DataFrame
        Columns ``x``, ``y`` (UMAP coordinates), ``cluster`` ("Cluster <label>"),
        ``term`` and ``genes``, one row per gene set.
    """
    print("\tTF-IDF vectorizing gene set data...")
    vectorizer = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    tfidf = vectorizer.fit_transform(libdict.values())
    print(tfidf.shape)
    adata = anndata.AnnData(tfidf)
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    # n_neighbors and min_dist are the main knobs to tune here
    sc.pp.neighbors(adata, n_neighbors=nneighbors)
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder the observations by cluster label before extracting coordinates.
    terms_by_cluster = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[terms_by_cluster, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    embedding = pd.DataFrame(adata.obsm['X_umap'])
    embedding.columns = ['x', 'y']
    embedding['cluster'] = adata.obs['leiden'].values
    embedding['term'] = adata.obs.index
    embedding['genes'] = [libdict[term] for term in embedding['term']]

    return embedding

In [None]:
def get_scatter_colors(df):
    """Assign a palette color to every distinct value of ``df['cluster']``.

    The 20-color Category20 palette is reordered so its even-indexed entries
    come before its odd-indexed ones. Clusters take colors in order of first
    appearance, wrapping around after 20.

    Returns
    -------
    dict
        Cluster label -> color string.
    """
    palette = list(Category20[20])
    colors = palette[::2] + palette[1::2]
    clusters = pd.unique(df['cluster']).tolist()
    return {cluster: colors[position % 20] for position, cluster in enumerate(clusters)}

def get_scatterplot(scatterdf):
    """Build the interactive Bokeh scatter plot of the gene set embedding.

    Parameters
    ----------
    scatterdf : pandas.DataFrame
        Must provide the columns ``x``, ``y``, ``cluster`` and ``term``. The
        frame is copied, so the caller's data is not modified.

    Returns
    -------
    bokeh figure
        A 1000x700 figure with one point per gene set, colored by cluster, and
        a hover tooltip showing the gene set name, coordinates and cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Every inner <div> is closed explicitly; the tooltip markup previously
    # opened four <div> elements and closed only two.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    # Column names here are the ones the tooltip (@gene_set, @x, @y, @cluster)
    # and the scatter glyph ('x', 'y', 'colors') refer to.
    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide tick marks and tick labels on both axes
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = 'Gene Sets in the Reactome Pathways 2024 Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    # The renderer returned by scatter() is not needed afterwards.
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
# Compute the embedding, overriding only `spread`. The other tunables keep
# their defaults (nneighbors=30, mindist=0.1, maxdf=1.0, mindf=1); values tried
# before were nneighbors=10, mindist=0.1, maxdf=0.9 and mindf=2.
scatter_df = process_scatterplot(libdict, spread=1.5)

# Build and display the scatter plot
plot = get_scatterplot(scatter_df)
show(plot)

In [None]:
# Save the plot as a standalone HTML page. The path is built with os.path.join
# because `scatterdir` already ends in '/', so the f-string form produced
# 'images//reactome24.html'.
output_file(filename=os.path.join(scatterdir, 'reactome24.html'), title = 'Gene Sets in the Reactome Pathways 2024 Library')
save(plot)