# MoTrPAC Rat Endurance Exercise Training
This notebook contains the code used to process the MoTrPAC Rat Endurance Exercise Training dataset for Harmonizome. Tissue samples from rats aged 1-8 weeks were collected from MoTrPAC timewise differential expression analysis files containing logFC gene expression values compared to 6-month old rats. These gene expression values were then normalized using gene-wise z-score normalization, keeping associations whose z-score has an absolute value ≥ 2.

In [None]:
import datetime
import json
import os
import sys

import numpy as np
import pandas as pd
import scanpy as sc
import scipy.spatial.distance as dist
import seaborn as sns
from tqdm import tqdm

# UMAP
from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

## Load and Pre-process Data

In [None]:
# Build one gene x condition logFC matrix from every timewise DEA table.
motrpacdir = 'transcriptomics/analysis/transcript-rna-seq/dea/'
deacolumns = ['feature_ID', 'tissue', 'sex', 'comparison_group', 'logFC']
tissueframes = []
# os.listdir returns entries in arbitrary order; sorting keeps the column
# order of the combined matrix reproducible between runs.
for rnaseqfile in sorted(os.listdir(motrpacdir)):
    if 'timewise' not in rnaseqfile:
        continue
    # usecols loads only the needed columns into a fresh frame.
    rnaseq = pd.read_csv(motrpacdir+rnaseqfile, sep='\t', usecols=deacolumns)
    # One term (future matrix column) per tissue/sex/comparison-group combination.
    rnaseq['term'] = rnaseq['tissue']+'_'+rnaseq['sex']+'_'+rnaseq['comparison_group']
    # The string 'max' is used because pandas deprecates passing the builtin.
    tissueframes.append(pd.crosstab(rnaseq['feature_ID'], rnaseq['term'], rnaseq['logFC'], aggfunc='max'))
# Concatenate once instead of re-copying a growing frame on every iteration.
motrpac = pd.concat(tissueframes, axis=1) if tissueframes else pd.DataFrame([])

In [None]:
# Display the combined logFC matrix as the cell output.
motrpac

In [None]:
# Ensembl gene ID -> NCBI gene ID lookups, one per mapping table.
ensemblgenemapping = (
    pd.read_csv('ensemblgenemapping.txt', sep='\t')
    .set_index('Gene stable ID')
    .loc[:, 'NCBI gene (formerly Entrezgene) ID']
    .to_dict()
)
ensembl2gene = (
    pd.read_csv('gene2ensembl.txt', sep='\t')
    .set_index('Ensembl_gene_identifier')
    .loc[:, 'GeneID']
    .to_dict()
)

In [None]:
# Keep only features whose Ensembl ID is a key of ensembl2gene, then re-key
# the matrix by the mapped GeneID.
motrpac = motrpac.reset_index()
# isin + map replaces the temporary 'genemapping' column and the assignment
# on a filtered slice, which raised pandas' SettingWithCopyWarning.
motrpac = motrpac[motrpac['feature_ID'].isin(set(ensembl2gene))].copy()
motrpac['feature_ID'] = motrpac['feature_ID'].map(ensembl2gene)
motrpac = motrpac.set_index('feature_ID')
motrpac

In [None]:
# GeneID -> Symbol lookup restricted to taxonomy id 10116.
mammalgeneinfo = pd.read_csv('../../tables/mammal_gene_info', sep='\t')
ratgenes = mammalgeneinfo[mammalgeneinfo['#tax_id']==10116].set_index('GeneID')['Symbol'].to_dict()
# Drop GeneIDs that have no entry in ratgenes instead of failing with a
# KeyError, mirroring how the other mapping steps discard unmapped rows.
motrpac = motrpac[motrpac.index.isin(set(ratgenes))]
motrpac.index = motrpac.index.map(ratgenes)
motrpac

In [None]:
def mapindex(matrix):
    """Re-key a matrix by the symbols that ``symbolmap`` assigns to its index.

    Index labels are upper-cased before the lookup; rows whose upper-cased
    label is not a key of the module-level ``symbolmap`` dict are dropped.

    Args:
        matrix: DataFrame whose index holds string labels.

    Returns:
        A new DataFrame (the input is left untouched) containing the mapped
        rows in their original order, indexed by the ``symbolmap`` values.
    """
    upper = [label.upper() for label in matrix.index]
    # Select rows by position rather than through a temporary 'mapped'
    # column, so a real data column with that name is never overwritten.
    keep = [position for position, label in enumerate(upper) if label in symbolmap]
    newmatrix = matrix.iloc[keep].copy()
    newmatrix.index = pd.Index(
        [symbolmap[upper[position]] for position in keep],
        name=matrix.index.name,
    )
    return newmatrix

In [None]:
# Symbol lookup read from the headerless mapping file: column 1 -> column 2.
symbolmap = (
    pd.read_csv('../../mapping/mappingFile_2023.tsv', sep='\t', header=None)
    .set_index(1)
    .loc[:, 2]
    .to_dict()
)
motrpac = mapindex(motrpac)
# Keep the rows that have at least (number of columns)/2 non-missing values.
motrpac = motrpac.dropna(axis='index', thresh=motrpac.shape[1]/2)
motrpac

In [None]:
# Fill missing values with the per-row mean. The matrix is transposed so the
# row means (a Series indexed by row label) line up with columns in fillna,
# then transposed back.
motrpac = motrpac.T.fillna(motrpac.mean(axis=1), axis=0).T
def zscore(gene):
    """Standardize one row of values to zero mean and unit standard deviation.

    Args:
        gene: pandas Series of numeric values.

    Returns:
        Series with the same index holding ``(value - mean) / std``, where
        ``std`` is ``Series.std()``. When ``std`` is 0 every result is NaN.
    """
    # Vectorized arithmetic replaces the per-element apply/lambda.
    return (gene - gene.mean()) / gene.std()

# Standardize every row independently (axis=1 passes one row at a time).
motrpac = motrpac.apply(zscore, axis=1)
motrpac

In [None]:
# Keep the cells with |z| > 2 and reshape them into a long table of
# (Gene, Sample, z) rows sorted by ascending z.
# The explicit dropna() discards the masked (NaN) cells even on pandas
# versions whose stack() no longer drops missing values by default.
motrpac = motrpac[np.abs(motrpac)>2].stack().dropna().to_frame().sort_values(0).reset_index()
motrpac.columns = ['Gene', 'Sample', 'z']
motrpac

## Process Data for SQL

### Resource

In [None]:
# Resource record for MoTrPAC, shown as the cell output. The comment on the
# next line names the tuple's fields in order.
#(id, name, acronym, long_description, short_description, url, num_attributes, num_datasets)
(107, 'Molecular Transducers of Physical Activity Consortium', 'MoTrPAC', 'MoTrPAC is a national research consortium designed to discover and perform preliminary characterization of the range of molecular transducers (the "molecular map") that underlie the effects of physical activity in humans.', 'MoTrPAC studies the molecular changes during and after exercise to understand how physical activity affects health.', 'https://www.motrpac.org', 142, 1)

### Dataset

In [None]:
# Dataset record: the print emits the field names in order, and the tuple
# below it holds the matching values (resource_fk 107 is the resource id).
print('(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, positive_association, negative_association, is_signed, is_continuous_valued, last_updated, directory, num_page_views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural, is_archived)')
(146, 'MoTrPAC Rat Endurance Exercise Training', 'Rat Endurance Exercise Training Transcriptomics', 'RNA-seq gene expression profiles for rat tissue samples across 4 time points and 19 tissues', 'gene-tissue sample association by differential expression of gene across tissue samples', 'genes with high or low expression in {0} relative to other tissue samples from the MoTrPAC Rat Endurance Exercise Training dataset.', 'sets of genes with high or low expression in each tissue sample relative to other tissue samples from the MoTrPAC Rat Endurance Exercise Training dataset.', 'tissue samples with high or low expression of {0} gene relative to other tissue samples from the MoTrPAC Rat Endurance Exercise Training dataset.', 'high expression', 'low expression', 1, 1, '2023-10-11', 'motrpac', 0, 107, 16, 7, 5, 1, 'gene expression by RNA-seq', 'primary experimental data', 'high throughput, data driven', 'tissue samples', 0)

### Publication

In [None]:
# Publication record, shown as the cell output. The comment on the next line
# names the tuple's fields in order. The long citation now spells the title's
# last word "Exercise", matching the `title` field.
#(id, long_citation, short_citation, url, pmid, pubmed_url, first_author_last_name, first_author_initials, journal_abbreviation, year, title, volume, pages)
(145, 'Sanford JA, et al. (2020) Molecular Transducers of Physical Activity Consortium (MoTrPAC): Mapping the Dynamic Responses to Exercise. Cell. 181:1464-1474', 'Sanford, Cell, 2020.', 'dx.doi.org/10.1016/j.cell.2020.06.004', 32589957, 'https://pubmed.ncbi.nlm.nih.gov/32589957', 'Sanford', 'JA', 'Cell', 2020, 'Molecular Transducers of Physical Activity Consortium (MoTrPAC): Mapping the Dynamic Responses to Exercise', 181, '1464-1474')

### Genes

In [None]:
# First id handed out to genes that are not yet listed in gene.csv.
index = 57376

geneinfo = pd.read_csv('../../tables/gene_info', sep='\t').drop_duplicates('Symbol').set_index('Symbol')
geneinfo = geneinfo[geneinfo['type_of_gene']=='protein-coding']
genes = pd.read_csv('../../tables/gene.csv')
genes['symbol'] = genes['symbol'].apply(str.upper)
genefks = genes.set_index('symbol')['id'].to_dict()
genes = genes['symbol'].to_list()
# Set copy of the known symbols: membership tests on the list are linear.
knowngenes = set(genes)

motrpac['Gene'] = motrpac['Gene'].apply(str.upper)
motrpac = motrpac.set_index('Gene')
# NOTE(review): 'Gene' is upper-cased here while geneinfo is indexed by
# 'Symbol' exactly as read from the file; mixed-case symbols would be treated
# as unknown and dropped -- confirm this is intended.
unknowngenes = []
for gene in motrpac.index.unique():
    if gene not in geneinfo.index:
        # No protein-coding gene_info entry: remove the gene after the loop.
        unknowngenes.append(gene)
    elif gene not in knowngenes:
        # New gene: print its row and remember the id assigned to it.
        print((index, gene, geneinfo.loc[gene, 'GeneID'], geneinfo.loc[gene, 'description'], 'https://ncbi.nlm.nih.gov/gene/'+str(geneinfo.loc[gene, 'GeneID'])), end=',\n')
        genefks[gene] = index
        index += 1
# A single drop replaces one full-frame copy per removed gene.
motrpac = motrpac.drop(unknowngenes)
motrpac = motrpac.reset_index()

### Naming Authority

In [None]:
# Naming authority record for MoTrPAC, shown as the cell output. Its first
# value (103) is the id written into every attribute row below.
# NOTE(review): the last value (145) equals the publication id above;
# presumably a publication foreign key -- confirm.
(103, 'Molecular Transducers of Physical Activity Consortium', 'MoTrPAC', 'MoTrPAC studies the molecular changes during and after exercise to understand how physical activity affects health.','https://www.motrpac.org', 145)

### Attributes

In [None]:
# Give every distinct sample a sequential attribute id starting at 365859,
# print one row per attribute, and leave `index` at the next free id.
index = 365859
attributefks = {
    sample: index + offset
    for offset, sample in enumerate(motrpac['Sample'].unique())
}

for sample, attributeid in attributefks.items():
    print((attributeid, sample, 'rat tissue sample identified by [tissue]_[sex]_[age in weeks]', 103), end=',\n')
index += len(attributefks)

### Gene Sets

In [None]:
# Give every distinct sample a sequential gene set id starting at 134700000,
# print one row per gene set (including the sample's attribute id), and
# leave `index` at the next free id.
index = 134700000
genesetfks = {
    sample: index + offset
    for offset, sample in enumerate(motrpac['Sample'].unique())
}

for sample, genesetid in genesetfks.items():
    print((genesetid, sample, 'rat tissue sample identified by [tissue]_[sex]_[age in weeks]', 146, 5, attributefks[sample]), end=',\n')
index += len(genesetfks)

### Associations

In [None]:
def threshold(x):
    """Collapse a value to its sign: 1 if positive, -1 if negative.

    Anything that is neither greater nor less than zero (zero itself, or NaN,
    which fails both comparisons) is returned unchanged.
    """
    if x > 0:
        return 1
    return -1 if x < 0 else x

In [None]:
# Offset added to the 0-based row labels to form the association ids.
index = 28000000

# Swap gene symbols and sample names for their ids and add the signed
# threshold derived from each z-score.
associations = motrpac.copy()
associations['Gene'] = [genefks[gene] for gene in associations['Gene']]
associations['Sample'] = [genesetfks[sample] for sample in associations['Sample']]
associations['Threshold'] = [threshold(z) for z in associations['z']]
associations.columns = ['gene_fk', 'gene_set_fk', 'standardized_value', 'threshold_value']
associations.index = associations.index + index
associations.to_csv('../../harmonizome-update/motrpac.csv')
associations

## Create Downloads

In [None]:
output_path = 'downloads/'
# The download cells write into output_path, kg_serializations/ and
# kg_serializations/motrpac_tsv/. Creating the deepest folder up front creates
# all three, so a fresh checkout does not fail with FileNotFoundError.
os.makedirs(output_path+'kg_serializations/motrpac_tsv', exist_ok=True)
# Signed threshold (1 / -1) for every retained z-score.
motrpac['threshold'] = motrpac['z'].apply(threshold)
motrpac

### Gene Attribute Ternary Matrix

In [None]:
# Gene x sample matrix of threshold values; gene/sample pairs without an
# association are filled with 0. The string 'max' is passed because pandas
# deprecates handing the builtin max to aggregation functions.
ternarymatrix = pd.crosstab(motrpac['Gene'], motrpac['Sample'], motrpac['threshold'], aggfunc='max').fillna(0)
ternarymatrixT = ternarymatrix.T
ternarymatrix.to_csv(output_path+'gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
ternarymatrix

### Gene Attribute Edge List

In [None]:
# Long-format edge list: one row per gene/sample association, with the gene's
# GeneID looked up by symbol in geneinfo.
geneids = geneinfo['GeneID'].to_dict()
edgelist = motrpac.copy()
edgelist['Gene ID'] = [geneids[symbol] for symbol in edgelist['Gene']]
edgelist = edgelist[['Gene', 'Gene ID', 'Sample', 'z', 'threshold']]
edgelist = edgelist.rename(columns={'z': 'Z-score', 'threshold': 'Threshold Value'})
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Distinct (Gene, Gene ID) pairs from the edge list, renumbered from 0.
geneslist = edgelist[['Gene', 'Gene ID']].drop_duplicates(ignore_index=True)
geneslist.to_csv(output_path+'gene_list_terms.txt.gz', sep='\t', compression='gzip')
geneslist

### Attribute List

In [None]:
# Distinct samples from the edge list, renumbered from 0.
attributeslist = edgelist[['Sample']].drop_duplicates(ignore_index=True)
attributeslist.to_csv(output_path+'attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributeslist

### Up Gene Set Library

In [None]:
# One GMT line per sample that has at least 5 genes with value 1:
# the sample name followed by those genes, tab-separated.
with open(output_path+'gene_set_library_up_crisp.gmt', 'w') as f:
    arr = ternarymatrix.to_numpy(dtype=np.int_)
    for column, attribute in enumerate(tqdm(ternarymatrix.columns)):
        upgenes = ternarymatrix.index[arr[:, column] == 1]
        if len(upgenes) < 5:
            continue
        print(attribute, *upgenes, sep='\t', end='\n', file=f)

### Down Gene Set Library

In [None]:
# One GMT line per sample that has at least 5 genes with value -1:
# the sample name followed by those genes, tab-separated.
with open(output_path+'gene_set_library_dn_crisp.gmt', 'w') as f:
    arr = ternarymatrix.to_numpy(dtype=np.int_)
    for column, attribute in enumerate(tqdm(ternarymatrix.columns)):
        downgenes = ternarymatrix.index[arr[:, column] == -1]
        if len(downgenes) < 5:
            continue
        print(attribute, *downgenes, sep='\t', end='\n', file=f)

### Up Attribute Set Library

In [None]:
# One GMT line per gene that has value 1 in at least 5 samples:
# the gene followed by those samples, tab-separated.
with open(output_path+'attribute_set_library_up_crisp.gmt', 'w') as f:
    arr = ternarymatrixT.to_numpy(dtype=np.int_)
    genes = ternarymatrixT.columns
    for column, gene in enumerate(tqdm(genes)):
        upsamples = ternarymatrixT.index[arr[:, column] == 1]
        if len(upsamples) < 5:
            continue
        print(gene, *upsamples, sep='\t', end='\n', file=f)

### Down Attribute Set Library

In [None]:
# One GMT line per gene that has value -1 in at least 5 samples:
# the gene followed by those samples, tab-separated.
with open(output_path+'attribute_set_library_dn_crisp.gmt', 'w') as f:
    arr = ternarymatrixT.to_numpy(dtype=np.int_)
    genes = ternarymatrixT.columns
    for column, gene in enumerate(tqdm(genes)):
        downsamples = ternarymatrixT.index[arr[:, column] == -1]
        if len(downsamples) < 5:
            continue
        print(gene, *downsamples, sep='\t', end='\n', file=f)

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between gene rows: pdist yields condensed
# cosine distances, squareform expands them to a square matrix, and
# 1 - distance converts distance to similarity.
gene_similarity_matrix = pd.DataFrame(
    data=1 - dist.squareform(dist.pdist(ternarymatrix.to_numpy(dtype=np.int_), 'cosine')),
    index=ternarymatrix.index,
    columns=ternarymatrix.index,
)
# Clear the axis names before writing the file.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between sample rows of the transposed matrix:
# pdist yields condensed cosine distances, squareform expands them to a
# square matrix, and 1 - distance converts distance to similarity.
attribute_similarity_matrix = pd.DataFrame(
    data=1 - dist.squareform(dist.pdist(ternarymatrixT.to_numpy(dtype=np.int_), 'cosine')),
    index=ternarymatrixT.index,
    columns=ternarymatrixT.index,
)
# Clear the axis names before writing the file.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Gene Attribute Standardized Matrix

In [None]:
# Gene x sample matrix of z-scores; pairs without an association are 0.
# The string 'max' is passed because pandas deprecates the builtin max here.
standardizedmatrix = pd.crosstab(motrpac['Gene'], motrpac['Sample'], motrpac['z'], aggfunc='max').fillna(0)
# Written under its own file name: 'gene_attribute_matrix.txt.gz' already
# holds the ternary matrix and was silently overwritten by this cell.
standardizedmatrix.to_csv(output_path+'gene_attribute_matrix_standardized.txt.gz', sep='\t', compression='gzip')
standardizedmatrix

### Knowledge Graph Serialization

In [None]:
# Knowledge-graph structures: `nodes` maps a node id to its description and
# `edges` lists one directed gene -> sample edge per association.
nodes = {}
edges = []

# Gene nodes are keyed by their integer Gene ID.
for symbol, geneid in zip(geneslist['Gene'], geneslist['Gene ID']):
    nodes[int(geneid)] = {
        "type":"gene",
        "properties": {
            "id":int(geneid),
            "label":symbol
        }}

# Sample nodes use the sample name as both id and label.
for sample in attributeslist['Sample']:
    nodes[sample] = {
        "type":"tissue sample",
        "properties": {
            "label":sample,
            "id":sample
        }}

# A threshold of 1 is "over-expressed in"; every other value is treated as
# "under-expressed in" with threshold -1, as before. Building the edge once
# removes the duplicated branches, and iterating the columns directly avoids
# a .loc row lookup per edge.
relations = {1: "over-expressed in", -1: "under-expressed in"}
for symbol, geneid, sample, z, thresholdvalue in zip(
        edgelist['Gene'], edgelist['Gene ID'], edgelist['Sample'],
        edgelist['Z-score'], edgelist['Threshold Value']):
    direction = 1 if thresholdvalue == 1 else -1
    edges.append({
        "source": int(geneid),
        "relation": relations[direction],
        "target": sample,
        "properties":{
            "id":str(geneid)+":"+sample,
            "source_id":int(geneid),
            "source_label":symbol,
            "target_id":sample,
            "target_label":sample,
            "directed":True,
            "standardized_value":z,
            "threshold":direction
        }})

#### RDF

In [None]:
# Write one "subject predicate object ." line per edge after two prefix lines.
with open(output_path+'kg_serializations/motrpac.rdf', 'w') as f:
    # NOTE(review): these prefix lines lack the <IRI> brackets and trailing
    # '.' that Turtle requires, and sample targets are emitted under the
    # 'gene:' prefix -- confirm what format downstream consumers expect.
    print('@prefix gene: ncbi.nlm.nih.gov/gene/', file=f)
    print('@prefix RO: purl.obolibrary.org/RO_', file=f)

    print('', file=f)
    for edge in edges:
        properties = edge['properties']
        # Threshold 1 uses RO:0002245, anything else RO:0002246. The stray
        # trailing space in the second predicate was removed so both kinds
        # of line are formatted identically.
        predicate = 'RO:0002245' if properties['threshold']==1 else 'RO:0002246'
        print('gene:'+str(properties['source_id']), predicate, 'gene:'+properties['target_id'], end=' .\n', file=f)

#### JSON

In [None]:
# Serialize the graph to JSON. json.dump writes to the file and returns None,
# so its result is no longer bound to a name.
with open(output_path+'kg_serializations/motrpac.json', 'w') as f:
    json.dump(
        {
            "Version":"1",
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

#### TSV

In [None]:
# Flatten the node dict into a (namespace, id, label) table and write it.
namespace = {'gene':'NCBI Entrez', 'tissue sample':'MoTrPAC'}
nodeframe = pd.DataFrame(nodes).T
nodeframe['id'] = [properties['id'] for properties in nodeframe['properties']]
nodeframe['label'] = [properties['label'] for properties in nodeframe['properties']]
nodeframe['namespace'] = [namespace[nodetype] for nodetype in nodeframe['type']]
nodeframe = nodeframe[['namespace', 'id', 'label']].reset_index(drop=True)
nodeframe.to_csv(output_path+'kg_serializations/motrpac_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Flatten the edge list into a table and write it as TSV.
edgeframe = pd.DataFrame(edges)
edgeframe['standardized'] = [properties['standardized_value'] for properties in edgeframe['properties']]
edgeframe['threshold'] = [properties['threshold'] for properties in edgeframe['properties']]
# NOTE(review): 'standardized' is computed above but is not among the columns
# kept below, so it never reaches edges.tsv -- confirm whether it should be.
edgeframe = edgeframe[['source', 'relation', 'target', 'threshold']]
edgeframe.to_csv(output_path+'kg_serializations/motrpac_tsv/edges.tsv', sep='\t')
edgeframe

## Create Visualizations

### Gene Attribute Clustered Heatmap

In [None]:
# Clustered heatmap of the ternary gene x sample matrix, colors centered at 0.
sns.clustermap(ternarymatrix, cmap='seismic', center=0)

### Gene Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the gene-gene cosine similarities, colors centered at 0.
sns.clustermap(gene_similarity_matrix, cmap='seismic', center=0)

### Attribute Similarity Clustered Heatmap

In [None]:
# Clustered heatmap of the sample-sample cosine similarities, colors centered at 0.
sns.clustermap(attribute_similarity_matrix, cmap='seismic', center=0)

### UMAP

In [None]:
def load_gmt(file, suffix='_down'):
    """Parse GMT lines into an ordered mapping of term -> gene string.

    Args:
        file: iterable of tab-separated lines (term first, then its genes).
        suffix: text appended to every term. The default '_down' matches the
            definition that used to remain bound to this name.

    Returns:
        OrderedDict mapping ``term + suffix`` to that line's distinct genes
        joined by single spaces, sorted so the string is reproducible.
    """
    gmt = OrderedDict()
    for line in file:
        line = line.strip()
        if not line:
            # A blank line would otherwise register an empty term.
            continue
        term, *geneset = line.split('\t')
        gmt[term+suffix] = ' '.join(sorted(set(geneset)))
    return gmt

# One parameterized loader replaces the two copies that differed only in the
# suffix; the context managers close the files the bare open() calls leaked.
with open('downloads/gene_set_library_up_crisp.gmt', 'r') as gmtfile:
    libdict = load_gmt(gmtfile, '_up')
with open('downloads/gene_set_library_dn_crisp.gmt', 'r') as gmtfile:
    downlibdict = load_gmt(gmtfile, '_down')
libdict.update(downlibdict)
scatterdir = 'images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed and cluster gene sets for plotting.

    Args:
        libdict: mapping of term -> gene string; the values are vectorized
            and the keys become the observation index.
        nneighbors: passed to ``sc.pp.neighbors`` as ``n_neighbors``.
        mindist: passed to ``sc.tl.umap`` as ``min_dist``.
        spread: passed to ``sc.tl.umap`` as ``spread``.
        maxdf: passed to ``TfidfVectorizer`` as ``max_df``.
        mindf: passed to ``TfidfVectorizer`` as ``min_df``.

    Returns:
        DataFrame with columns ``x``, ``y`` (the two UMAP coordinates),
        ``cluster`` ('Cluster ' + Leiden label), ``term`` and ``genes``,
        ordered by Leiden label.
    """
    print("\tTF-IDF vectorizing gene set data...")
    vec = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    X = vec.fit_transform(libdict.values())
    print(X.shape)
    # Wrap the TF-IDF matrix so the scanpy tools below can operate on it.
    adata = anndata.AnnData(X)
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    # n_neighbors and min_dist come from the nneighbors / mindist arguments;
    # the Leiden resolution and the UMAP random_state are fixed here.
    sc.pp.neighbors(adata, n_neighbors=nneighbors)
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder the observations by Leiden label, then prefix the labels.
    new_order = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[new_order, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    # The UMAP embedding supplies the plot coordinates.
    df = pd.DataFrame(adata.obsm['X_umap'])
    df.columns = ['x', 'y']

    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    # Look each term's gene string back up in the input mapping.
    df['genes'] = [libdict[l] for l in df['term']]

    return df

In [None]:
def get_scatter_colors(df):
    """Map each distinct value of ``df['cluster']`` to a Category20 color.

    The 20-color palette is reordered so its even-indexed entries come before
    its odd-indexed ones, and colors repeat after 20 clusters.
    """
    palette = list(Category20[20])
    ordered = palette[::2] + palette[1::2]
    return {
        cluster: ordered[position % 20]
        for position, cluster in enumerate(pd.unique(df['cluster']).tolist())
    }

def get_scatterplot(scatterdf):
    """Build the Bokeh scatter plot of the embedded gene sets.

    Args:
        scatterdf: DataFrame with ``x``, ``y``, ``cluster`` and ``term``
            columns. It is copied, not modified.

    Returns:
        The Bokeh figure: one point per row, colored by cluster, with a hover
        tooltip showing the gene set, its coordinates and its cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Each tooltip row is now closed with its own </div>; the first two rows
    # used to be left open, nesting the later rows inside them.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000,
        height=700,
        tools=tools_emb
    )

    # Column names here are the fields the tooltip and the glyph refer to.
    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # Hide the tick marks and shrink the tick labels to zero size.
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt'

    plot_emb.output_backend = "svg"

    plot_emb.title = 'Gene Sets in the MoTrPAC Rat Endurance Exercise Training Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'

    # The renderer returned by scatter() is not needed, so it is not kept.
    plot_emb.scatter(
        'x',
        'y',
        size = 4,
        source = source,
        color = 'colors'
    )

    return plot_emb

In [None]:
# Embed the combined up/down library. Arguments not passed keep their
# defaults (nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1).
scatter_df = process_scatterplot(libdict, nneighbors=10, mindist=0.1)

# Build the figure and render it inline in the notebook.
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# Save the plot as a standalone HTML page inside scatterdir.
# The folder is created if missing, os.path.join avoids the doubled slash of
# the old f-string, and the page title no longer misspells "Endurance".
os.makedirs(scatterdir, exist_ok=True)
output_file(filename=os.path.join(scatterdir, 'umap.html'), title = 'Gene Sets in MoTrPAC Rat Endurance Exercise Training Library')
save(plot)