# Harmonizome ETL: Reactome

**Authors**  
Created by: Charles Dai  
Updated by: Ido Diamant  
Credit to: Moshe Silverstein


Data source: http://reactome.org/download-data/

In [None]:
#%%appyter init
# Initialise appyter's notebook magics. The argument is a callable that
# returns this notebook's globals() when invoked.
from appyter import magic
magic.init(lambda _=globals: _())

In [None]:
import os
import sys
from datetime import date

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import harmonizome.utility_functions as uf
import harmonizome.lookup as lookup

In [None]:
# Load IPython's autoreload extension and switch it to mode 2.
%load_ext autoreload
%autoreload 2

# Initialization

In [None]:
%%appyter hide_code
{# Declare the 'data' section that the input fields below attach to. #}
{% do SectionField(
    name='data',
    title='Upload Data',
    img='load_icon.png'
) %}

In [None]:
%%appyter code_eval
{# Help text for the 'data' section; the link points at the same download page as the data source named at the top of the notebook. #}
{% do DescriptionField(
    name='Description',
    text='The example below was sourced from <a href="http://reactome.org/download-data/" target ="_blank">reactome.org</a>. If clicking on the example does not work, it should be downloaded directly from the source.',
    section='data'
) %}

{# Uploaded pathway gene set. The keyword was previously misspelled and its pattern demanded a slash, so it could not have matched the default file name; the pattern now accepts any name ending in .gmt.zip. NOTE(review): confirm against appyter's FileField that the pattern is applied to the file name. #}
{% set df_file = FileField(
    constraint='.*[.]gmt[.]zip$',
    name='pathways_gene',
    label='Pathway Gene Set (gmt.zip)',
    default='ReactomePathways.gmt.zip',
    examples={
        'ReactomePathways.gmt.zip': 'https://reactome.org/download/current/ReactomePathways.gmt.zip'
    },
    section='data'
) %}

**Load and Save Mapping Dictionaries**

In [None]:
# Fetch the two lookup tables used further down: `symbol_lookup` is passed to
# uf.map_symbols, `geneid_lookup` to uf.gene_list and uf.turtle.
symbol_lookup, geneid_lookup = lookup.get_lookups()

**Output Path**

In [None]:
# Prefix of the file names handed to the uf.save_* calls below.
output_name = 'reactome'

# Directory that receives the output files. exist_ok=True keeps a re-run of
# this cell harmless and avoids the check-then-create race of testing
# os.path.exists first.
path = 'Output/Reactome'
os.makedirs(path, exist_ok=True)

# Load Data

In [None]:
%%appyter code_exec

# Read the uploaded file (the df_file field) without a header row.
# sep='%' is presumably chosen so that every tab-delimited line lands whole
# in column 0, which the pre-processing cell then splits on tabs.
# NOTE(review): this relies on no line containing a literal '%' - confirm.
df = pd.read_csv(
    {{df_file}},
    sep='%', header= None)

In [None]:
# Preview the first rows of the raw table.
df.head()

In [None]:
# Dimensions of the raw table.
df.shape

# Pre-process Data

In [None]:
# Column 0 holds one tab-delimited record per row: pathway name, pathway ID,
# then the member gene symbols. Split every row once and reuse the result
# rather than re-splitting the whole column for each of the three fields.
gmt_fields = df[0].str.split('\t')
df[0], df[1], df[2] = gmt_fields.str[0], gmt_fields.str[1], gmt_fields.str[2:]
df.columns = ['Pathway', 'Pathway ID', 'Gene Symbol']

# Reduce pathway names to plain ASCII: NFKD-decompose, then drop every
# character that cannot be encoded as ASCII.
df['Pathway'] = (
    df['Pathway']
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
)
df.head()

In [None]:
# Give every gene in a pathway's gene list its own row, then index the
# table by gene symbol.
df = df.explode('Gene Symbol').set_index('Gene Symbol')
df.head()

# Filter Data

**Map Gene Symbols to Up-to-date Approved Gene Symbols**

In [None]:
# Run the gene-symbol index through uf.map_symbols with the symbol lookup,
# asking it to remove duplicates.
# NOTE(review): exact mapping/dedup rules live in harmonizome.utility_functions.
df = uf.map_symbols(df, symbol_lookup, remove_duplicates=True)
df.head()

In [None]:
# Dimensions of the table after symbol mapping.
df.shape

# Analyze Data

**Create Binary Matrix**

In [None]:
binary_matrix = uf.binary_matrix(df)
binary_matrix.head()

In [None]:
binary_matrix.shape

In [None]:
uf.save_data(binary_matrix, path, output_name + '_binary_matrix',
            compression = 'npz', dtype=np.uint8)

**Create Gene List**

In [None]:
gene_list = uf.gene_list(binary_matrix, geneid_lookup)
gene_list.head()

In [None]:
gene_list.shape

In [None]:
uf.save_data(gene_list, path, output_name + '_gene_list',
             ext= 'tsv', compression= 'gzip', index=False)

**Create Attribute List**

In [None]:
attribute_list = uf.attribute_list(binary_matrix)
attribute_list.head()

In [None]:
attribute_list.shape

In [None]:
uf.save_data(attribute_list, path, output_name + '_attribute_list',
             ext= 'tsv', compression= 'gzip')

**Create Gene Set Library and Attribute Set Library**

In [None]:
uf.save_setlib(binary_matrix, 'gene', 'up', path, output_name + '_gene_up_set')

In [None]:
uf.save_setlib(binary_matrix, 'attribute', 'up', path, output_name + '_attribute_up_set')

**Create Gene Similarity Matrix**

In [None]:
gene_similarity_matrix = uf.similarity_matrix(binary_matrix, 'jaccard', sparse=True)
gene_similarity_matrix.head()

In [None]:
uf.save_data(gene_similarity_matrix, path, 
             output_name + '_gene_similarity_matrix', 
             compression='npz', symmetric=True, dtype=np.float32)

**Create Attribute Similarity Matrix**

In [None]:
attribute_similarity_matrix = uf.similarity_matrix(binary_matrix.T, 'jaccard', sparse=True)
attribute_similarity_matrix.head()

In [None]:
uf.save_data(attribute_similarity_matrix, path, 
             output_name + '_attribute_similarity_matrix', 
             compression='npz', symmetric=True, dtype=np.float32)

**Create Gene-Attribute Edge List**

In [None]:
# Derive the edge list from the binary matrix and write it under `path`
# as gzipped TSV.
edge_list = uf.edge_list(binary_matrix)
uf.save_data(
    edge_list,
    path,
    f'{output_name}_edge_list',
    ext='tsv',
    compression='gzip',
)

**Create Turtle RDF**

In [None]:
pathwayid_lookup = df.set_index('Pathway')
pathwayid_lookup = pathwayid_lookup.to_dict()['Pathway ID']

In [None]:
uf.turtle(binary_matrix, geneid_lookup, pathwayid_lookup)

# Create Downloadable Save File

In [None]:
# Hand the output directory to uf.archive (presumably bundles everything
# written above into one downloadable file - see harmonizome.utility_functions).
uf.archive(path)