# Harmonizome ETL: Reactome

### Authors
Moshe Silverstein, Charles Dai, Ido Diamant  
  
Ma’ayan Lab, Mount Sinai Center for Bioinformatics, Department of Pharmacological Sciences  
Icahn School of Medicine at Mount Sinai, New York, NY 10029 USA

[Reactome](https://reactome.org/) is a database of manually curated pathways. It provides tools for the visualization, interpretation, and analysis of pathway knowledge.  

This Appyter takes data from the Reactome Pathways Gene Set and outputs files that are usable for machine learning and other applications. It processes the [ReactomePathways.gmt.zip](https://reactome.org/download/current/ReactomePathways.gmt.zip) file downloaded on 09-19-2022.
  
The Appyter uses the NCBI database to map the gene names to a set of approved gene symbols so that synonymous genes are mapped to the same symbol.  

The Appyter creates gene and attribute similarity matrices, which contain the Jaccard Index between any two gene sets or attribute sets.  
    
The following output files are made available for download:  
* A binary matrix
* Gene list
* Attribute list
* A gene set library: for each attribute (pathway), a list of genes that are associated with the attribute
* An attribute set library: for each gene, a list of attributes (pathways) that are associated with each gene
* Gene-gene similarity matrix
* Attribute-attribute similarity matrix
* Gene-attribute edge list: a list of gene-attribute pairs and the strength of each association
* Serialized data for Knowledge Graph ingestion: a list of gene and pathway nodes, and gene &rarr; pathway edges  
  
A ZIP archive containing these files is provided at the bottom of the report.

In [None]:
#%%appyter init
# Initialise the Appyter magics; the lambda gives appyter a callable that
# returns this notebook's globals().
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import os
import sys
from datetime import date
from tqdm import tqdm
from PIL import Image

import numpy as np
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
import scanpy as sc
from IPython.display import FileLink, FileLinks

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

from bokeh.io import output_notebook, export_svg
from bokeh.io.export import get_screenshot_as_png
from bokeh.plotting import figure, show
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20

In [None]:
# Load IPython's autoreload extension and switch it to mode 2, so edited
# modules are re-imported before each cell runs.
%load_ext autoreload
%autoreload 2

# Initialization

In [None]:
%%appyter hide_code
{# Declare the 'data' section of the input form; the fields defined in the
   next cell attach to it through section='data'. #}
{% do SectionField(
    name= 'data',
    title= 'Upload Data',
    img='load_icon.png'
)%}

In [None]:
%%appyter code_eval
{% do DescriptionField(
    name= 'Description',
    text= 'The example below was sourced from <a href="http://reactome.org/pages/download=data/" target ="_blank">reactome.org</a>. If clicking on the example does not work, it should be downloaded directly from the source.',
    section='data'
)%}

{% set df_file = FileField(
    constrant= '.*.gmt(.zip)?',
    name= 'pathways_gene',
    label= 'Pathway Gene Set (gmt.zip)',
    default= 'ReactomePathways.gmt.zip',
    examples= {
        'ReactomePathways.gmt.zip': 'https://reactome.org/download/current/ReactomePathways.gmt.zip'
    },
    section= 'data'
)%}

In [None]:
# Configure Bokeh to render plots inline in the notebook when show() is called.
output_notebook()

## Load and Save Mapping Dictionaries

In [None]:
# Fetch the two lookup tables from harmonizome.lookup. symbol_lookup is later
# passed to uf.map_symbols; geneid_lookup is passed to uf.gene_list and is
# indexed by gene symbol when the knowledge-graph nodes and edges are built.
symbol_lookup, geneid_lookup = lookup.get_lookups()

## Create Output Path  
Creates a folder to store output files

In [None]:
# Prefix shared by the names of all generated output files.
output_name = 'reactome'

# Directory that receives the generated files. exist_ok=True makes re-running
# the cell a no-op and avoids the check-then-create race of testing
# os.path.exists() first.
path = 'output'
os.makedirs(path, exist_ok=True)

# Load Data

In [None]:
%%appyter code_exec

# Read the uploaded file with '%' as the separator and no header row. The
# pre-processing cell splits column 0 on tabs itself, so this presumably
# relies on '%' not occurring in the file, leaving each line as a single
# column -- TODO confirm.
df = pd.read_csv(
    {{df_file}},
    sep='%', header= None)

In [None]:
# Preview the first rows of the raw table; the trailing expression shows its shape.
display(df.head())
df.shape

# Pre-process Data

In [None]:
# Split every raw line on tabs once: the first two fields are joined into the
# pathway label, the remaining fields form the list of gene symbols.
gmt_fields = df[0].str.split('\t')
pathway_labels = gmt_fields.str[0] + ' ' + gmt_fields.str[1]
df[0] = pathway_labels
df[1] = gmt_fields.str[2:]
df.columns = ['Pathway', 'Gene Symbol']

# Decompose accented characters (NFKD) and drop whatever is not ASCII.
df['Pathway'] = (
    df['Pathway']
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)

# Two pathways count as duplicates when their space-joined gene lists are
# identical; only the first occurrence is kept.
rows_before = df.shape[0]
gene_key = df["Gene Symbol"].apply(' '.join)
df = df[~gene_key.duplicated()]


display(df.head())
display(df.shape)

print(f"Found {rows_before - df.shape[0]} gene sets with duplicate genes, keeping first instance of each.")

In [None]:
# Create copy of df to be used in statistics and histogram
geneSetLibrary = df.copy()
geneSetLibrary["Genes"] = geneSetLibrary["Gene Symbol"].apply(' '.join)
geneSetLibrary["Gene Set Length"] = geneSetLibrary["Gene Symbol"].apply(len)
geneSetLibrary = geneSetLibrary.drop(columns= ["Gene Symbol"])

display(geneSetLibrary.head())
geneSetLibrary.shape

In [None]:
# Unroll the gene lists into one row per (gene, pathway) pair and index the
# result by gene symbol.
df = df.explode('Gene Symbol').set_index('Gene Symbol')
display(df.head())
df.shape

# Filter Data

## Map Gene Symbols to Up-to-Date Approved Gene Symbols

In [None]:
# Map gene symbols through symbol_lookup, removing duplicates, and repeat
# until a pass no longer changes the number of rows.
#
# The previous loop set `count` to exactly the value that zeroed its own
# condition, so it always stopped after a single extra pass regardless of
# whether the mapping had stabilised.
dupes = df.shape[0]  # row count before any mapping (name kept for continuity)
df = uf.map_symbols(df, symbol_lookup, remove_duplicates=True)

previous_rows = dupes
while df.shape[0] != previous_rows:
    previous_rows = df.shape[0]
    df = uf.map_symbols(df, symbol_lookup, remove_duplicates=True)

display(df.head())
# A bare `df.shape` in the middle of a cell is not displayed; show it explicitly.
display(df.shape)

print("Found " + str(dupes-df.shape[0]) + " synonymous genes, keeping first instance of each.")

# Analyze Data

## Generate Dataset Statistics

In [None]:
# Summary table: number of distinct genes, number of distinct pathways
# ("terms"), and their ratio rounded to three decimals (stored as a string).
genestats = df.groupby("Gene Symbol")
pathstats = df.groupby("Pathway")
n_genes, n_terms = len(genestats), len(pathstats)

stats = pd.DataFrame(
    {"Statistics": [n_genes, n_terms, str(round(n_genes / n_terms, 3))]},
    index=["Genes", "Terms", "Mean Genes/Term"],
)
stats.head()

## Create Binary Matrix

In [None]:
# Build the binary matrix from the gene/pathway table via uf.binary_matrix,
# then preview it. Later cells read genes from its index and pathways from its
# columns.
binary_matrix = uf.binary_matrix(df)
display(binary_matrix.head())
binary_matrix.shape

In [None]:
# Write the binary matrix to the output folder, requesting uint8 storage.
uf.save_data(
    binary_matrix,
    path,
    f'{output_name}_binary_matrix',
    dtype=np.uint8,
)

## Create Gene List

In [None]:
# Build the gene list from the binary matrix and the gene-ID lookup, then preview it.
gene_list = uf.gene_list(binary_matrix, geneid_lookup)
display(gene_list.head())
gene_list.shape

In [None]:
# Write the gene list as a TSV file, without its index.
uf.save_data(
    gene_list,
    path,
    f'{output_name}_gene_list',
    ext='tsv',
    index=False,
)

## Create Attribute List

In [None]:
# Build the attribute (pathway) list from the binary matrix, then preview it.
# Its index is later split into pathway name and pathway ID for the KG export.
attribute_list = uf.attribute_list(binary_matrix)
display(attribute_list.head())
attribute_list.shape

In [None]:
# Write the attribute list as a TSV file.
uf.save_data(
    attribute_list,
    path,
    f'{output_name}_attribute_list',
    ext='tsv',
)

## Create Gene Set Library and Attribute Set Library

In [None]:
# Save the set library written to '<output_name>_gene_set' (called with
# 'gene' and 'up'). NOTE(review): the meaning of the 'up' argument is defined
# in uf.save_setlib -- confirm there.
uf.save_setlib(binary_matrix, 'gene', 'up', path, output_name + '_gene_set')

In [None]:
# Save the set library written to '<output_name>_attribute_set' (called with
# 'attribute' and 'up'). NOTE(review): the meaning of the 'up' argument is
# defined in uf.save_setlib -- confirm there.
uf.save_setlib(binary_matrix, 'attribute', 'up', path, output_name + '_attribute_set')

## Create Gene Similarity Matrix

In [None]:
# Jaccard similarity computed on binary_matrix as-is (the attribute version
# passes the transpose instead), using the sparse code path; preview the result.
gene_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
gene_similarity_matrix.head()

In [None]:
# Write the gene-gene similarity matrix, flagged as symmetric, as float32.
uf.save_data(
    gene_similarity_matrix,
    path,
    f'{output_name}_gene_similarity_matrix',
    symmetric=True,
    dtype=np.float32,
)

## Create Attribute Similarity Matrix

In [None]:
# Same Jaccard computation on the transposed binary matrix, so similarities
# are between its columns (pathways); preview the result.
attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
# Write the attribute-attribute similarity matrix, flagged as symmetric, as float32.
uf.save_data(
    attribute_similarity_matrix,
    path,
    f'{output_name}_attribute_similarity_matrix',
    symmetric=True,
    dtype=np.float32,
)

## Create Gene-Attribute Edge List

In [None]:
# Derive the gene-attribute edge list from the binary matrix and write it as TSV.
edge_list = uf.edge_list(binary_matrix)
uf.save_data(
    edge_list,
    path,
    f'{output_name}_edge_list',
    ext='tsv',
)

## Serialize Data for Knowledge Graph Ingestion

In [None]:
# Turn the attribute list into a nested dict keyed by the full attribute
# label: {'Pathway': {label: name}, 'Pathway ID': {label: id}, ...}.
# Each label is split on whitespace; the last token is taken as the pathway ID
# and the remaining tokens, re-joined with single spaces, as the pathway name.
pathwayid_lookup = attribute_list.copy(deep=True).reset_index()
label_tokens = pathwayid_lookup['index'].apply(str.split)
pathway_names = label_tokens.str[:-1].apply(' '.join)
pathway_ids = label_tokens.str[-1]
pathwayid_lookup['Pathway'] = pathway_names
pathwayid_lookup['Pathway ID'] = pathway_ids
pathwayid_lookup = pathwayid_lookup.set_index('index').to_dict()

In [None]:
def _kg_node(label, node_type, node_id, uri_prefix):
    """Return the single-entry {label: {...}} record stored for one KG node."""
    return {
        label: {
            "type": node_type,
            "properties": {
                "id": node_id,
                "label": label,
                "URI": uri_prefix + node_id,
            },
        }
    }


nodes = {}

# One node per entry of the gene-ID lookup, linked to its NCBI Gene page.
for gene in geneid_lookup:
    nodes[gene] = _kg_node(
        gene,
        "gene",
        str(geneid_lookup[gene]),
        "https://www.ncbi.nlm.nih.gov/gene/",
    )

# One node per pathway, keyed by pathway name and linked to its Reactome page.
for label_key, pathway_name in pathwayid_lookup['Pathway'].items():
    nodes[pathway_name] = _kg_node(
        pathway_name,
        "pathway",
        str(pathwayid_lookup['Pathway ID'][label_key]),
        "https://www.reactome.org/content/detail/",
    )

In [None]:
# Build one directed gene -> pathway edge for every 1 in the binary matrix.
arr = binary_matrix.reset_index(drop=True).to_numpy(dtype=np.int_)
name_by_label = pathwayid_lookup['Pathway']
id_by_label = pathwayid_lookup['Pathway ID']

edges = []
for col in tqdm(range(arr.shape[1])):
    column_label = binary_matrix.columns[col]
    target_name = name_by_label[str(column_label)]
    target_id = str(id_by_label[column_label])
    # Genes (row labels) whose entry in this pathway column equals 1.
    members = binary_matrix.index[arr[:, col] == 1]
    for source in members:
        edges.append(
            {
                "source": source,
                "relation": "participates in",
                "target": target_name,
                "properties": {
                    "id": str(source) + " -> " + target_name,
                    "source_id": str(geneid_lookup[source]),
                    "target_id": target_id,
                    "directed": True,
                },
            }
        )

In [None]:
# Write the knowledge-graph payload (version tag, nodes, edges) as indented
# JSON into the output folder. json.dump returns None, so its result is no
# longer bound to a variable.
kg_file = os.path.join(path, output_name + "_serialization_for_kg.json")
with open(kg_file, "w") as serialize:
    json.dump(
        {
            "Version": "1",
            "nodes": nodes,
            "edges": edges,
        },
        serialize,
        indent=4,
    )

# Visualize Data

## Gene Set Histogram

In [None]:
# Summary statistics of the pathway table, grouped by gene-set length.
geneSetLibrary.groupby("Gene Set Length").describe()

In [None]:
# Histogram of gene-set sizes: one bar per distinct gene-set length, bar
# height = number of pathways with that length.
group = geneSetLibrary.groupby("Gene Set Length")
# Built from a GroupBy, the data source exposes describe()-style columns; the
# bars and the tooltip read 'Pathway_count' from it.
source = ColumnDataSource(group)
hist = figure(plot_width = 1000, 
        plot_height=500,
        x_axis_type = "log",
        # x_range=(0,max(geneSetLibrary["Gene Set Length"])),
        y_range=(0, max(group.Pathway.count())),
        title="Gene Set Length in Reactome 2022 Library",
        x_axis_label = "Gene Set Length",
        y_axis_label = "Gene Sets")
hist.vbar(x="Gene Set Length", top = "Pathway_count", line_color="black", hover_fill_color="firebrick", 
        hover_line_color="black", hover_alpha=0.3, source=source)

hist.xaxis.axis_label_text_font_style = 'normal'
hist.xaxis.axis_label_text_font_size = '18px'
hist.yaxis.axis_label_text_font_size = '18px'
hist.yaxis.axis_label_text_font_style = 'normal'
hist.title.align = 'center'
hist.title.text_font_size = '18px'

# Tooltip markup; the first inner <div> was previously left unclosed.
hist.add_tools(HoverTool(tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:150px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set Length:</span>
                <span style="font-size: 12px">@{Gene Set Length}</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Count:</span>
                <span style="font-size: 12px">@Pathway_count</span>
            </div>
        </div>
    """))

show(hist)

## UMAP Visualization

In [None]:
# Represent every gene set as a TF-IDF vector over the tokens of its gene string.
libDict = geneSetLibrary.set_index('Pathway')['Genes'].to_dict()

vec = TfidfVectorizer()
X = vec.fit_transform(libDict.values())
adata = anndata.AnnData(X, dtype='float32')
adata.obs.index = libDict.keys()

# Neighbour graph -> Leiden clustering -> UMAP layout.
sc.pp.neighbors(adata, n_neighbors=25, use_rep='X')
sc.tl.leiden(adata)
sc.tl.umap(adata, a = 5.8, b = 0.55)

# Order observations by cluster label and prefix the labels with 'Cluster '.
new_order = adata.obs.sort_values(by='leiden').index.tolist()
adata = adata[new_order,:]
adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

# Flat table for plotting: coordinates, cluster, gene-set name and colour.
mapped_df = pd.DataFrame(adata.obsm['X_umap'], columns=['x', 'y'])
mapped_df['cluster'] = adata.obs['leiden'].values
mapped_df['term'] = adata.obs.index

# Palette: the even-indexed Category20 colours first, then the odd-indexed
# ones; clusters beyond the 20th reuse colours from the start.
clusters = pd.unique(mapped_df['cluster']).tolist()
palette = list(Category20[20])
colors = palette[::2] + palette[1::2]
color_mapper = {name: colors[pos % 20] for pos, name in enumerate(clusters)}

mapped_df['color'] = mapped_df['cluster'].apply(lambda label: color_mapper[label])

mapped_df.head()

In [None]:
# Interactive scatter plot of the UMAP embedding, one point per gene set,
# coloured by cluster.
xlabel = 'UMAP 1'
ylabel = 'UMAP 2'

source2 = ColumnDataSource(
        data=dict(
            x = mapped_df.x,
            y = mapped_df.y,
            alpha = [0.7] * mapped_df.shape[0],
            colors = mapped_df['color'], 
            size = [6] * mapped_df.shape[0],
            gene_set = mapped_df['term'],
            cluster = mapped_df['cluster']
        )
    )

# Tooltip bound to the glyph named "df"; the first two inner <div> elements
# were previously left unclosed.
hover_emb = HoverTool(names=["df"], tooltips="""
    <div style="margin: 10">
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
            <span style="font-size: 12px">@gene_set</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
            <span style="font-size: 12px">(@x,@y)</span>
        </div>
        <div style="margin: 0 auto; width:300px;">
            <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
            <span style="font-size: 12px">@cluster</span>
        </div>
    </div>
    """)

tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']
title_emb = 'Gene Sets in Reactome Library'
plot_emb = figure(plot_width=1000, plot_height=700, tools=tools_emb, title=title_emb, x_axis_label=xlabel, y_axis_label=ylabel)
plot_emb.circle( 'x', 'y', source = source2, size='size',
                alpha='alpha', line_alpha=0, line_width=0.01, name="df", 
                fill_color = 'colors', 
                line_color="black", hover_fill_color="firebrick")
plot_emb.xaxis.axis_label_text_font_style = 'normal'
plot_emb.xaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_size = '18px'
plot_emb.yaxis.axis_label_text_font_style = 'normal'
plot_emb.title.align = 'center'
plot_emb.title.text_font_size = '18px'

show(plot_emb)

# Downloadable File Links

In [None]:
# Archive the output folder via uf.archive, passing the prefix 'reactome_'.
# NOTE(review): the next cell links to 'reactome_output_archive.zip', so the
# prefix presumably determines the archive name -- confirm in uf.archive.
uf.archive(path, output_name+"_")

In [None]:
# Show a download link for the ZIP archive, followed by links to the files in
# the 'output' folder.
display(FileLink('reactome_output_archive.zip', result_html_prefix='Archive of all files: '))
display(FileLinks('output'))