# MoTrPAC Rat Endurance Exercise Training Transcriptomics
This updated MoTrPAC processing notebook is based on the [notebook](https://github.com/MaayanLab/EnrichrPythonScripts/blob/08aa5fe3620189835f4fe2b03ae607a93f42ce5f/Enrichr/MoTrPAC.ipynb) used to process the MoTrPAC dataset for Enrichr. This notebook contains the code used to process the MoTrPAC Rat Endurance Exercise Training dataset for Harmonizome. Tissue samples from rats aged 1-8 weeks were collected from MoTrPAC timewise differential expression analysis and training files. These files are available for download from the MoTrPAC [download portal](https://motrpac-data.org/data-download).

In [None]:
import pandas as pd
import datetime
import numpy as np
import os
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm

# UMAP
from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

## Load and Pre-Process Data

In [None]:
# Load the tab-separated transcript feature annotation table and preview it.
feature_annot_path = "transcriptomics/analysis/transcript-rna-seq/feature-annot/TRNSCRPT_FEATURE_ANNOT.txt"
feature_df = pd.read_csv(feature_annot_path, sep="\t")
feature_df.head()

In [None]:
# Map each gene_id in the feature annotation to its gene_name. Zipping the two
# columns replaces a slow row-by-row DataFrame.iterrows() loop; as before, the
# last row wins when a gene_id is repeated.
gene_mapper = dict(zip(feature_df["gene_id"], feature_df["gene_name"]))

# Lookup built from the headerless mapping file: values of column 1 map to
# the values of column 2.
# NOTE(review): presumably gene symbol/synonym -> approved symbol; confirm
# against the mapping file.
symbolmap = pd.read_csv('../../mapping/mappingFile_2023.tsv', sep='\t', header=None).set_index(1)[2].to_dict()

### Timewise Data

In [None]:
# Collect every "timewise" differential-expression table found in the DEA
# directory, keep the columns used downstream and label each row with a
# tissue_sex_comparison-group term.
motrpacdir = 'transcriptomics/analysis/transcript-rna-seq/dea/'
timewise_columns = ['feature_ID', 'tissue', 'sex', 'comparison_group', 'adj_p_value', 'logFC']
timewise_frames = []
# sorted() makes the row order reproducible; os.listdir() order is arbitrary.
for rnaseqfile in sorted(os.listdir(motrpacdir)):
    if 'timewise' not in rnaseqfile:
        continue
    # Plain column selection raises a KeyError naming a missing column,
    # where .get([...]) would silently hand back None.
    rnaseq = pd.read_csv(os.path.join(motrpacdir, rnaseqfile), sep='\t')[timewise_columns].copy()
    rnaseq['term'] = rnaseq['tissue']+'_'+rnaseq['sex']+'_'+rnaseq['comparison_group']
    timewise_frames.append(rnaseq)
# Concatenate once instead of re-copying the accumulated frame for every file.
motrpac = pd.concat(timewise_frames) if timewise_frames else pd.DataFrame([])
motrpac

In [None]:
# Translate feature identifiers through gene_mapper, then discard every row
# that is left with a missing value in any column.
mapped_names = motrpac['feature_ID'].map(gene_mapper)
motrpac = motrpac.assign(feature_ID=mapped_names).dropna()
motrpac

In [None]:
# Keep the four columns used downstream, then only the rows whose adjusted
# p-value is below 0.05.
kept_columns = ['term', 'feature_ID', 'adj_p_value', 'logFC']
motrpac = motrpac.loc[:, kept_columns]
motrpac = motrpac.loc[motrpac['adj_p_value'].lt(0.05)]
motrpac

In [None]:
def threshold(score):
    """Return the sign of ``score`` as 1, -1 or 0.

    Args:
        score: Numeric value, e.g. a log fold change.

    Returns:
        1 when ``score`` is positive, -1 when it is negative, and 0
        otherwise. A score of exactly zero previously fell through and
        returned ``None``; NaN also compares false on both tests and so
        yields 0.
    """
    if score > 0:
        return 1
    if score < 0:
        return -1
    return 0

In [None]:
# Upper-case the gene names and translate them through symbolmap; rows whose
# name has no entry (or that hold any other missing value) are dropped.
approved_symbols = motrpac['feature_ID'].apply(str.upper).map(symbolmap)
motrpac = motrpac.assign(feature_ID=approved_symbols).dropna()
# Replace each log fold change with the value threshold() returns for it.
motrpac = motrpac.assign(logFC=motrpac['logFC'].apply(threshold))
motrpac = motrpac.set_axis(['term', 'gene', 'adj_p_value', 'threshold'], axis=1)
motrpac

### Training Data

In [None]:
# Collect every "training" differential-expression table found in the DEA
# directory, keeping only the columns used downstream.
motrpacdir = 'transcriptomics/analysis/transcript-rna-seq/dea/'
training_columns = ['feature_ID', 'tissue', 'adj_p_value']
training_frames = []
# sorted() makes the row order reproducible; os.listdir() order is arbitrary.
for rnaseqfile in sorted(os.listdir(motrpacdir)):
    if 'training' not in rnaseqfile:
        continue
    # Plain column selection raises a KeyError naming a missing column,
    # where .get([...]) would silently hand back None.
    rnaseq = pd.read_csv(os.path.join(motrpacdir, rnaseqfile), sep='\t')[training_columns]
    training_frames.append(rnaseq)
# Concatenate once instead of re-copying the accumulated frame for every file.
motrpactraining = pd.concat(training_frames) if training_frames else pd.DataFrame([])
motrpactraining

In [None]:
# Translate feature identifiers through gene_mapper, then discard every row
# that is left with a missing value in any column.
mapped_training_names = motrpactraining['feature_ID'].map(gene_mapper)
motrpactraining = motrpactraining.assign(feature_ID=mapped_training_names).dropna()
motrpactraining

In [None]:
# Keep the three columns used downstream, then only the rows whose adjusted
# p-value is below 0.05.
training_kept_columns = ['tissue', 'feature_ID', 'adj_p_value']
motrpactraining = motrpactraining.loc[:, training_kept_columns]
motrpactraining = motrpactraining.loc[motrpactraining['adj_p_value'].lt(0.05)]
motrpactraining

In [None]:
# Upper-case the gene names and translate them through symbolmap; rows whose
# name has no entry (or that hold any other missing value) are dropped.
approved_training_symbols = motrpactraining['feature_ID'].apply(str.upper).map(symbolmap)
motrpactraining = motrpactraining.assign(feature_ID=approved_training_symbols).dropna()
# Every training row receives a threshold of 1.
motrpactraining = motrpactraining.assign(threshold=1)
motrpactraining

In [None]:
# Turn each tissue name into a "<tissue>_consensus" term and give the frame
# the same column names as the timewise table.
motrpactraining = motrpactraining.assign(tissue=motrpactraining['tissue'] + '_consensus')
motrpactraining = motrpactraining.set_axis(['term', 'gene', 'adj_p_value', 'threshold'], axis=1)
motrpactraining

### Combine Timewise and Training Data

In [None]:
# Stack the timewise and training tables and renumber the rows from 0.
motrpac = pd.concat([motrpac, motrpactraining], ignore_index=True)
motrpac

In [None]:
# Replace each adjusted p-value with -log10(p) multiplied by the row's
# threshold value.
neg_log_p = -np.log10(motrpac['adj_p_value'])
motrpac['adj_p_value'] = neg_log_p * motrpac['threshold']
motrpac

In [None]:
# NOTE(review): `error` is a bare, undefined name, so evaluating this cell
# raises NameError. Presumably an intentional stop so that "Run All" halts
# before the SQL and download cells below — confirm.
error

## Process Data for SQL

### Gene

In [None]:
# Print an SQL value tuple for every gene that is not yet in the gene table
# and record a foreign key for each of them.
index = 58399  # id given to the first newly added gene; incremented per gene
gene_info = pd.read_csv('../../tables/gene_info', sep='\t')[['GeneID', 'Symbol', 'description']].set_index('Symbol')
genes = pd.read_csv('../../tables/gene.csv', index_col='symbol')
genefks = genes['id'].to_dict()
for gene in motrpac['gene'].unique():
    gene_key = gene.upper()
    if gene_key not in genes.index:
        geneid = gene_info.loc[gene,'GeneID']
        print((index, gene, geneid, gene_info.loc[gene,'description'], f'https://ncbi.nlm.nih.gov/gene/{geneid}'), end=',\n')
        # Store the key upper-cased: both the membership test above and the
        # Association cell's genefks[x.upper()] lookup use the upper-cased
        # symbol, so a mixed-case key could never be found again.
        genefks[gene_key] = index
        index += 1

### Attribute

In [None]:
# Print one SQL value tuple per unique term and remember the id assigned to
# each term in attributefks.
index = 401548
attributefks = {}

for term in motrpac['term'].unique():
    description = (
        'consensus sample identified as [tissue]_consensus'
        if 'consensus' in term
        else 'rat tissue sample identified by [tissue]_[sex]_[age in weeks]'
    )
    attribute_row = (index, term, description, 103)
    print(attribute_row, end=',\n')
    attributefks[term] = index
    index += 1

### Gene Set

In [None]:
# Print one SQL value tuple per unique term (including the term's attribute
# id) and remember the gene set id assigned to each term in genesetfks.
index = 134700000
genesetfks = {}

for term in motrpac['term'].unique():
    description = (
        'consensus sample identified as [tissue]_consensus'
        if 'consensus' in term
        else 'rat tissue sample identified by [tissue]_[sex]_[age in weeks]'
    )
    geneset_row = (index, term, description, 146, 5, attributefks[term])
    print(geneset_row, end=',\n')
    genesetfks[term] = index
    index += 1

### Association

In [None]:
# Assemble the association table: one row per (gene, gene set) pair with its
# standardized value and threshold value.
association = motrpac.copy()
association['gene_fk'] = association['gene'].apply(lambda x: genefks[x.upper()])
association['gene_set_fk'] = association['term'].apply(lambda x: genesetfks[x])
association = association[['gene_fk', 'gene_set_fk', 'adj_p_value', 'threshold']]
association.columns = ['gene_fk', 'gene_set_fk', 'standardized_value', 'threshold_value']
# Keep the first row of each pair. `drop` takes a boolean; the former string
# 'true' only worked because any non-empty string is truthy.
association = association.drop_duplicates(subset=['gene_fk','gene_set_fk']).reset_index(drop=True).rename_axis('id')
# Shift the ids so that numbering starts at 28000000.
association.index += 28000000
association.to_csv('../../harmonizome-update/motrpacnew.tsv', sep='\t')
association

## Create Downloads

In [None]:
# Root directory for every download file written below.
output_path = 'downloads/'
# Create the directory tree up front; the cells below write into
# output_path, kg_serializations/ and kg_serializations/motrpac_tsv/ and
# would otherwise fail when those directories do not exist yet.
os.makedirs(os.path.join(output_path, 'kg_serializations', 'motrpac_tsv'), exist_ok=True)

### Ternary Matrix

In [None]:
# Gene x term matrix of threshold values; when a pair occurs more than once
# the largest value is kept, and pairs with no row are filled with 0.
# aggfunc is passed as the string 'max' because recent pandas emits a
# FutureWarning when the builtin max is given.
ternarymatrix = pd.crosstab(motrpac['gene'], motrpac['term'], motrpac['threshold'], aggfunc='max').fillna(0)
# Transposed copy (terms as rows) used by the attribute-centric outputs.
ternarymatrixT = ternarymatrix.T
ternarymatrix.to_csv(f'{output_path}gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
ternarymatrix

### Gene Attribute Edge List

In [None]:
# Build the gene-attribute edge list: attach each gene's GeneID and give the
# columns their published names before writing the gzipped TSV.
geneids = gene_info['GeneID'].to_dict()
edgelist = motrpac.copy()
edgelist['Gene ID'] = edgelist['gene'].apply(geneids.__getitem__)
edge_column_names = {
    'gene': 'Gene',
    'Gene ID': 'Gene ID',
    'term': 'Term',
    'adj_p_value': 'Standardized Value',
    'threshold': 'Threshold Value',
}
edgelist = edgelist[list(edge_column_names)].rename(columns=edge_column_names)
edgelist.to_csv(f'{output_path}gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Unique (Gene, Gene ID) pairs from the edge list, renumbered from 0.
geneslist = edgelist[['Gene', 'Gene ID']].drop_duplicates(ignore_index=True)
geneslist.to_csv(f'{output_path}gene_list_terms.txt.gz', sep='\t', compression='gzip')
geneslist

### Attribute List

In [None]:
# Unique terms from the edge list, renumbered from 0.
attributeslist = edgelist[['Term']].drop_duplicates(ignore_index=True)
attributeslist.to_csv(f'{output_path}attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributeslist

### Up Gene Set Library

In [None]:
# Write the "up" gene set library: one GMT line per term listing the genes
# whose ternary value is 1, skipping terms with fewer than 5 such genes.
with open(f'{output_path}gene_set_library_up_crisp.gmt', 'w') as f:
    arr = ternarymatrix.to_numpy(dtype=np.int_)
    attributes = ternarymatrix.columns

    n_rows, n_cols = arr.shape
    for i in tqdm(range(n_cols)):
        members = ternarymatrix.index[arr[:, i] == 1]
        if len(members) < 5:
            continue
        # term, empty description field, then the member genes
        f.write('\t'.join(map(str, (attributes[i], '', *members))) + '\n')

### Down Gene Set Library

In [None]:
# Write the "down" gene set library: one GMT line per term listing the genes
# whose ternary value is -1, skipping terms with fewer than 5 such genes.
with open(f'{output_path}gene_set_library_dn_crisp.gmt', 'w') as f:
    arr = ternarymatrix.to_numpy(dtype=np.int_)
    attributes = ternarymatrix.columns

    n_rows, n_cols = arr.shape
    for i in tqdm(range(n_cols)):
        members = ternarymatrix.index[arr[:, i] == -1]
        if len(members) < 5:
            continue
        # term, empty description field, then the member genes
        f.write('\t'.join(map(str, (attributes[i], '', *members))) + '\n')

### Up Attribute Set Library

In [None]:
# Write the "up" attribute set library: one GMT line per gene listing the
# terms whose ternary value is 1, skipping genes with fewer than 5 such terms.
with open(f'{output_path}attribute_set_library_up_crisp.gmt', 'w') as f:
    arr = ternarymatrixT.to_numpy(dtype=np.int_)
    genes = ternarymatrixT.columns

    n_rows, n_cols = arr.shape
    for i in tqdm(range(n_cols)):
        members = ternarymatrixT.index[arr[:, i] == 1]
        if len(members) < 5:
            continue
        # gene, empty description field, then the member terms
        f.write('\t'.join(map(str, (genes[i], '', *members))) + '\n')

### Down Attribute Set Library

In [None]:
# Write the "down" attribute set library: one GMT line per gene listing the
# terms whose ternary value is -1, skipping genes with fewer than 5 such terms.
with open(f'{output_path}attribute_set_library_dn_crisp.gmt', 'w') as f:
    arr = ternarymatrixT.to_numpy(dtype=np.int_)
    genes = ternarymatrixT.columns

    n_rows, n_cols = arr.shape
    for i in tqdm(range(n_cols)):
        members = ternarymatrixT.index[arr[:, i] == -1]
        if len(members) < 5:
            continue
        # gene, empty description field, then the member terms
        f.write('\t'.join(map(str, (genes[i], '', *members))) + '\n')

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity (1 - cosine distance) between the gene rows of
# the ternary matrix, written out as a gzipped TSV.
gene_cosine_distance = dist.squareform(dist.pdist(ternarymatrix.to_numpy(dtype=np.int_), 'cosine'))

gene_similarity_matrix = pd.DataFrame(
    data=1 - gene_cosine_distance,
    index=ternarymatrix.index,
    columns=ternarymatrix.index,
)
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(f'{output_path}gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity (1 - cosine distance) between the term rows of
# the transposed ternary matrix, written out as a gzipped TSV.
attribute_cosine_distance = dist.squareform(dist.pdist(ternarymatrixT.to_numpy(dtype=np.int_), 'cosine'))

attribute_similarity_matrix = pd.DataFrame(
    data=1 - attribute_cosine_distance,
    index=ternarymatrixT.index,
    columns=ternarymatrixT.index,
)
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(f'{output_path}attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Gene Attribute Standardized Matrix

In [None]:
# Gene x term matrix of the standardized values; when a pair occurs more than
# once the largest value is kept, and pairs with no row are filled with 0.
# aggfunc is passed as the string 'max' because recent pandas emits a
# FutureWarning when the builtin max is given.
standardizedmatrix = pd.crosstab(motrpac['gene'], motrpac['term'], motrpac['adj_p_value'], aggfunc='max').fillna(0)
standardizedmatrix.to_csv(f'{output_path}gene_attribute_matrix_standardized.txt.gz', sep='\t', compression='gzip')
standardizedmatrix

### Knowledge Graph Serialization

In [None]:
# Build the knowledge-graph records: `nodes` maps a node id to its record and
# `edges` lists one record per row of the edge list. Columns are iterated
# directly instead of looking every row up again with .loc, and the two
# formerly duplicated edge branches are merged into one.
nodes = {}
edges = []

# One node per gene, keyed by its integer Gene ID.
for gene_symbol, gene_id in zip(geneslist['Gene'], geneslist['Gene ID']):
    nodes[int(gene_id)] = {
        "type":"gene",
        "properties": {
            "id":int(gene_id),
            "label":gene_symbol
        }}

# One node per term, keyed by the term prefixed with 'MoTrPAC_'.
for term in attributeslist['Term']:
    nodes['MoTrPAC_'+term] = {
        "type":"tissue sample",
        "properties": {
            "label":term,
            "id":'MoTrPAC_'+term
        }}

edge_columns = zip(
    edgelist['Gene'],
    edgelist['Gene ID'],
    edgelist['Term'],
    edgelist['Standardized Value'],
    edgelist['Threshold Value'],
)
for gene_symbol, gene_id, term, standardized_value, threshold_value in edge_columns:
    # A threshold of exactly 1 is "over-expressed"; every other value is
    # recorded as "under-expressed" with threshold -1, as before.
    over_expressed = threshold_value == 1
    edges.append({
        "source": int(gene_id),
        "relation": "over-expressed in" if over_expressed else "under-expressed in",
        "target": 'MoTrPAC_'+term,
        "properties":{
            "id":str(gene_id)+":MoTrPAC_"+term,
            "source_id":int(gene_id),
            "source_label":gene_symbol,
            "target_id":'MoTrPAC_'+term,
            "target_label":term,
            "directed":True,
            "standardized_value":standardized_value,
            "threshold":1 if over_expressed else -1
        }})

#### RDF

In [None]:
# Write one "subject predicate object ." line per edge: RO:0002245 when the
# edge threshold is 1, RO:0002246 otherwise.
# NOTE(review): the @prefix lines below lack the <...> IRI brackets and the
# closing '.' that Turtle syntax requires — confirm the intended target
# format before changing them.
with open(output_path+'kg_serializations/motrpac.rdf', 'w') as f:
    print('@prefix gene: ncbi.nlm.nih.gov/gene/', file=f)
    print('@prefix RO: purl.obolibrary.org/RO_', file=f)

    print('', file=f)
    for edge in edges:
        # The second predicate used to carry a trailing space, which put two
        # spaces before the object on those lines only.
        predicate = 'RO:0002245' if edge["properties"]["threshold"]==1 else 'RO:0002246'
        print('gene:'+str(edge['properties']['source_id']), predicate, edge['properties']['target_id'], end=' .\n', file=f)

#### JSON

In [None]:
# Serialize the nodes and edges to an indented JSON document.
# json.dump() writes to the file and returns None, so its result is no
# longer bound to a variable.
with open(output_path+'kg_serializations/motrpac.json', 'w') as f:
    json.dump(
        {
            "Version":"1", 
            "nodes": nodes,
            "edges": edges
        }, indent=4, fp=f)

#### TSV

In [None]:
# Flatten the node records into a namespace / id / label table and write it
# out as a TSV.
namespace = {'gene':'NCBI Entrez', 'tissue sample':'MoTrPAC'}
nodeframe = pd.DataFrame(nodes).T
node_properties = nodeframe['properties']
nodeframe['id'] = node_properties.apply(lambda props: props['id'])
nodeframe['label'] = node_properties.apply(lambda props: props['label'])
nodeframe['namespace'] = nodeframe['type'].apply(namespace.__getitem__)
nodeframe = nodeframe[['namespace', 'id', 'label']].reset_index(drop=True)
nodeframe.to_csv(output_path+'kg_serializations/motrpac_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Flatten the edge records into a source / relation / target / standardized /
# threshold table and write it out as a TSV.
edgeframe = pd.DataFrame(edges)
edge_properties = edgeframe['properties']
edgeframe['standardized'] = edge_properties.apply(lambda props: props['standardized_value'])
edgeframe['threshold'] = edge_properties.apply(lambda props: props['threshold'])
edgeframe = edgeframe[['source', 'relation', 'target', 'standardized', 'threshold']]
edgeframe.to_csv(output_path+'kg_serializations/motrpac_tsv/edges.tsv', sep='\t')
edgeframe

## Create Visualizations

### Gene Attribute Heat Map

In [None]:
# Clustered heat map of the gene x term ternary matrix, using the 'seismic'
# colormap centered at 0.
sns.clustermap(ternarymatrix, cmap='seismic', center=0)

### Gene Similarity Clustered Heatmap

In [None]:
# Clustered heat map of the gene-gene cosine similarity matrix, using the
# 'seismic' colormap centered at 0.
sns.clustermap(gene_similarity_matrix, cmap='seismic', center=0)

### Attribute Similarity Clustered Heatmap

In [None]:
# Clustered heat map of the term-term cosine similarity matrix, using the
# 'seismic' colormap centered at 0.
sns.clustermap(attribute_similarity_matrix, cmap='seismic', center=0)

### UMAP

In [None]:
def load_gmt(file, suffix='_down', keep_consensus_names=False):
    """Read a GMT gene set library into an ordered ``{term: genes}`` mapping.

    Each non-blank line is split on tabs into a term, a description field
    (ignored) and the gene symbols. Repeated genes are dropped while keeping
    first-seen order, so the joined string is the same on every run.

    The defaults reproduce the definition that was previously bound last in
    this cell (every term receives ``'_down'``).

    Args:
        file: Iterable of GMT lines, e.g. an open text file.
        suffix: Text appended to each term name.
        keep_consensus_names: When true, terms containing ``'consensus'``
            keep their name unchanged instead of receiving ``suffix``.

    Returns:
        OrderedDict mapping each term name to its genes joined by single
        spaces.
    """
    gmt = OrderedDict()
    for line in file:
        line = line.strip()
        if not line:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue
        term, _description, *geneset = line.split('\t')
        if not (keep_consensus_names and 'consensus' in term):
            term = f'{term}{suffix}'
        gmt[term] = ' '.join(dict.fromkeys(geneset))
    return gmt


# Read the libraries from output_path, the directory the GMT cells above
# write to (the previous 'downloads_new/' path is not produced anywhere in
# this notebook), and close the files once they have been read.
with open(f'{output_path}gene_set_library_up_crisp.gmt', 'r') as up_file:
    libdict = load_gmt(up_file, suffix='_up', keep_consensus_names=True)
with open(f'{output_path}gene_set_library_dn_crisp.gmt', 'r') as down_file:
    downlibdict = load_gmt(down_file)
libdict.update(downlibdict)
scatterdir = 'images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed a gene set library with TF-IDF, Leiden clustering and UMAP.

    Args:
        libdict: Mapping of term name to a string of gene symbols.
        nneighbors: ``n_neighbors`` passed to ``sc.pp.neighbors``.
        mindist: ``min_dist`` passed to ``sc.tl.umap``.
        spread: ``spread`` passed to ``sc.tl.umap``.
        maxdf: ``max_df`` passed to ``TfidfVectorizer``.
        mindf: ``min_df`` passed to ``TfidfVectorizer``.

    Returns:
        DataFrame with one row per term, ordered by Leiden cluster, holding
        the UMAP coordinates (``x``, ``y``), the ``cluster`` label, the
        ``term`` name and its ``genes`` string.
    """
    print("\tTF-IDF vectorizing gene set data...")
    tfidf = TfidfVectorizer(max_df=maxdf, min_df=mindf).fit_transform(libdict.values())
    print(tfidf.shape)
    adata = anndata.AnnData(tfidf)
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    # n_neighbors and min_dist are the parameters most worth tuning here
    sc.pp.neighbors(adata, n_neighbors=nneighbors)
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder the observations so that members of a cluster sit together.
    cluster_order = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[cluster_order, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    df = pd.DataFrame(adata.obsm['X_umap'], columns=['x', 'y'])
    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    df['genes'] = [libdict[term] for term in df['term']]

    return df

In [None]:
def get_scatter_colors(df):
    """Assign a Category20 color to every distinct value of ``df['cluster']``.

    The palette is reordered so that its even-indexed colors come first and
    its odd-indexed colors follow; clusters take colors in order of first
    appearance and the palette repeats after 20 clusters.

    Args:
        df: DataFrame with a ``cluster`` column.

    Returns:
        Dict mapping each cluster label to a color string.
    """
    palette = list(Category20[20])
    ordered_colors = palette[::2] + palette[1::2]
    cluster_labels = pd.unique(df['cluster']).tolist()
    return {
        label: ordered_colors[position % 20]
        for position, label in enumerate(cluster_labels)
    }

def get_scatterplot(scatterdf):
    """Build a Bokeh scatter plot of the UMAP embedding, colored by cluster.

    Args:
        scatterdf: DataFrame with ``x``, ``y``, ``cluster`` and ``term``
            columns; it is copied, not modified.

    Returns:
        The Bokeh figure, with a hover tooltip showing each point's gene
        set name, coordinates and cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Each tooltip row is now closed with its own </div>; previously two of
    # the three row containers were left open, producing malformed HTML.
    # NOTE(review): "margin: 10" has no unit, so browsers ignore it —
    # confirm whether "10px" was intended.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide axis tick marks and tick labels
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = 'Gene Sets in the MoTrPAC Rat Endurance Exercise Training Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    # The returned renderer is not needed, so it is no longer bound to a name.
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
## defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
# Settings used for this library; maxdf=0.5 and mindf=2 are further options
# that are currently left at their defaults.
umap_settings = dict(nneighbors=3, mindist=0.1, spread=1.5)
scatter_df = process_scatterplot(libdict, **umap_settings)

# Display Scatter Plot
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# Save the scatter plot as a standalone HTML page.
# os.path.join avoids the doubled slash that f"{scatterdir}/umap.html"
# produced (scatterdir already ends in '/'); the page title typo
# "Endruance" is corrected to "Endurance".
output_file(filename=os.path.join(scatterdir, "umap.html"), title = 'Gene Sets in MoTrPAC Rat Endurance Exercise Training Library')
save(plot)