# GTEx Tissue Expression
This notebook contains the processing code for the GTEx Tissue Gene Expression Profiles 2023 dataset. A gene count matrix was downloaded from the [GTEx Data Portal](https://gtexportal.org/home/downloads/adult-gtex). Gene counts were summarized as the median per tissue, z-scored gene-wise across tissues, and the top and bottom 1000 associations for each tissue were kept. The final edgelist contains 108,000 associations between 54 tissues and 17,360 genes.

In [None]:
import pandas as pd
import datetime
import math
import numpy as np
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm

# UMAP
from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

## Load and Pre-Process Data

In [None]:
# Read the GTEx gene read-count table; the first two lines of the file are
# skipped (presumably the GCT preamble — confirm against the file).
gtextissue = pd.read_csv(
    'bulk-gex_v8_rna-seq_GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.gct',
    sep='\t',
    skiprows=2,
)
gtextissue

In [None]:
# Load the NCBI gene_info table and keep protein-coding genes with tax_id 9606,
# indexed by upper-cased symbol.
geneinfo = pd.read_csv('../../tables/gene_info', sep='\t')
geneinfo['Symbol'] = geneinfo['Symbol'].apply(str.upper)
# Build one combined mask. Chaining two boolean selections would apply a mask
# computed on the unfiltered frame to the already-filtered one, which pandas
# only resolves by reindexing the mask (and emits a UserWarning).
is_human_coding = (geneinfo['#tax_id'] == 9606) & (geneinfo['type_of_gene'] == 'protein-coding')
geneinfo = geneinfo.loc[is_human_coding].set_index('Symbol')[['GeneID', 'description']]
geneinfo

In [None]:
# Lookup built from the mapping file: column 1 is the key, column 2 the value.
genemapping = (
    pd.read_csv('../../mapping/mappingFile_2023.tsv', sep='\t', header=None, index_col=1)[2]
    .to_dict()
)
gtextissue['Gene'] = gtextissue['Description'].map(genemapping)
# Drop unmapped rows, keep the first row per mapped gene, and index by gene.
gtextissue = (
    gtextissue
    .dropna(subset='Gene')
    .drop_duplicates(subset='Gene')
    .set_index('Gene')
    .drop(columns=['Name', 'Description'])
)
# Keep only genes present in the filtered gene_info table.
gtextissue = gtextissue[gtextissue.index.isin(geneinfo.index)]
gtextissue

### Load Sample Metadata

In [None]:
samplemeta = pd.read_csv(
    'GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt',
    sep='\t',
    index_col=0,
)
# Sample id (the index) -> SMTSD value, used below to relabel the count columns.
tissuedict = samplemeta['SMTSD'].to_dict()
# SMTSD value -> SMUBRID value; for repeated SMTSD values the last row wins.
uberon = dict(zip(samplemeta['SMTSD'], samplemeta['SMUBRID']))
samplemeta

In [None]:
# Relabel each sample column with its tissue, then collapse the samples of a
# tissue to their median.
gtextissue.columns = gtextissue.columns.map(tissuedict)
# `groupby(axis=1)` is deprecated (pandas 2.1) and removed in pandas 3.0, so
# group the transposed frame by its index and transpose back.
gtextissue = gtextissue.T.groupby(level=0).median().T.rename_axis('Tissue', axis=1)
gtextissue

In [None]:
def zscore(gene):
    """Standardize a Series of values to zero mean and unit standard deviation.

    Parameters
    ----------
    gene : pandas.Series
        Numeric values to standardize (one row of the gene-by-tissue matrix
        when used with ``DataFrame.apply(zscore, axis=1)``).

    Returns
    -------
    pandas.Series
        ``(gene - gene.mean()) / gene.std()`` with the same index as the input.
        ``Series.std`` uses the sample standard deviation (ddof=1); a constant
        input has zero standard deviation and therefore yields NaN.
    """
    # Vectorised arithmetic instead of a per-element lambda.
    return (gene - gene.mean()) / gene.std()

In [None]:
# Standardize every row (gene) across its tissue columns.
gtextissuez = gtextissue.apply(zscore, axis='columns')
gtextissuez

In [None]:
# Long format: one row per (Gene, Tissue) pair holding its z-score.
gtextissue = gtextissuez.stack().to_frame('z')
# Iterate over the groups rather than calling `groupby.apply`: applying to the
# grouping column is deprecated (pandas 2.2), and once the grouping column is
# excluded the 'Tissue' column would be lost by `reset_index(drop=True)`.
# Groups are visited in sorted tissue order, matching the previous row order.
bytissue = gtextissue.reset_index().groupby('Tissue')
gtextissueup = pd.concat([group.nlargest(1000, 'z') for _, group in bytissue]).reset_index(drop=True)
gtextissuedn = pd.concat([group.nsmallest(1000, 'z') for _, group in bytissue]).reset_index(drop=True)
# Up associations first, then down associations, with a fresh 0..n-1 index.
gtextissue = pd.concat([gtextissueup, gtextissuedn]).reset_index(drop=True)
gtextissue

## Process Data for SQL

### Dataset

In [None]:
# Row for the dataset table, one value per column (column name in the trailing comment).
# Typos in the description strings are corrected: 'GEx' -> 'GTEx', 'acoss' -> 'across', 'wiith' -> 'with'.
(
    151,  # id
    'GTEx Tissue Gene Expression Profiles 2023',  # name
    'Tissue Gene Expression Profiles 2023',  # name_without_resource
    'gene expression profiles for tissues from GTEx by RNA-seq',  # description
    'gene-tissue associations by differential expression of gene across tissues',  # association
    'genes with high or low expression in {0} relative to other tissues from the GTEx Tissue Gene Expression Profiles 2023 dataset.',  # gene_set_description
    'sets of genes with high or low expression in each tissue relative to other tissues from the GTEx Tissue Gene Expression Profiles 2023 dataset.',  # gene_sets_description
    'tissues with high or low expression of {0} gene relative to other tissues from the GTEx Tissue Gene Expression Profiles 2023 dataset.',  # attribute_set_description
    'high expression',  # positive_association
    'low expression',  # negative_association
    1,  # is_signed
    1,  # is_continuous_valued
    '2023-11-01',  # last_updated
    'gtextissue23',  # directory
    0,  # num_page_views
    29,  # resource_fk
    16,  # measurement_fk
    7,  # dataset_group_fk
    4,  # attribute_type_fk
    1,  # attribute_group_fk
    'gene expression by RNA-seq',  # evidence_type
    'primary experimental data',  # evidence_group
    'high throughput, data driven',  # measurement_bias
    'tissues',  # attribute_type_plural
    0,  # is_archived
)

### Publication

In [None]:
# Row for the publication table, one value per column (column name in the trailing comment).
(
    150,  # id
    'GTEx Consortium, et al. (2020) The GTEx Consortium atlas of genetic regulatory effects across human tissues. Science. 369:1318-30.',  # long_citation
    'GTEx, Science, 2020',  # short_citation
    'dx.doi.org/10.1126/science.aaz1776',  # url
    32913098,  # pmid
    'https://ncbi.nlm.nih.gov/pubmed/32913098',  # pubmed_url
    'GTEx',  # first_author_last_name
    'Consortium',  # first_author_initials
    'Science',  # journal_abbreviation
    2020,  # year
    'The GTEx Consortium atlas of genetic regulatory effects across human tissues',  # title
    369,  # volume
    '1318-30',  # pages
)

### Gene

In [None]:
# Id given to the first gene that is missing from gene.csv (presumably the next
# free id in the gene table — confirm before inserting).
index = 57646
genes = pd.read_csv('../../tables/gene.csv')
gtextissue['Gene'] = gtextissue['Gene'].apply(str.upper)
genelist = genes['symbol'].apply(str.upper).tolist()
# Key the symbol -> id lookup by the upper-cased symbol. Both the membership
# test below and the later `.map(genefks)` use upper-cased symbols, so keying by
# the raw symbol left mixed-case symbols without a foreign key.
# If two symbols collide after upper-casing, the later row wins.
genefks = dict(zip(genelist, genes['id']))
# Set membership instead of scanning the list once per gene.
knownsymbols = set(genelist)

# Print one tuple per gene missing from gene.csv and register its new id.
for gene in gtextissue['Gene'].unique():
    if gene not in knownsymbols:
        # Plain int so the printed tuple shows the bare number under any numpy version.
        geneid = int(geneinfo.loc[gene, 'GeneID'])
        print((index, gene, geneid, geneinfo.loc[gene, 'description'], 'https://ncbi.nlm.nih.gov/gene/'+str(geneid)), end=',\n')
        genefks[gene] = index
        index += 1

### Attribute

In [None]:
# Id given to the first printed attribute row; incremented per tissue.
index = 392558
attributefks = {}
tissueids = {}
# A multi-character separator is only supported by the python parser engine;
# naming it explicitly avoids the fallback ParserWarning.
tissuedescs = pd.read_csv('tissuedescriptions.tsv', sep='    ', header=None, engine='python').set_index(0)[1].to_dict()
for tissue in gtextissue['Tissue'].unique():
    tissueid = uberon[tissue]
    if 'EFO' in tissueid:
        # Ids containing 'EFO' are used as-is.
        ontologyfk = 8
    else:
        # Every other id gets the 'UBERON_' prefix.
        tissueid = 'UBERON_'+tissueid
        ontologyfk = 25
    # NOTE(review): 8 / 25 is the last field of the printed tuple — presumably a
    # naming-authority foreign key; confirm against the attribute table schema.
    tissueids[tissue] = tissueid
    # The URL needs a '/' between 'obo' and the term id (it was missing).
    print((index, tissue, tissuedescs[tissueid], tissueid, 'http://purl.obolibrary.org/obo/'+tissueid, ontologyfk), end=',\n')
    attributefks[tissue] = index
    index += 1

### Gene Set

In [None]:
# Id given to the first printed gene-set row; incremented per tissue.
index = 135200000
genesetfks = {}
for tissue in gtextissue['Tissue'].unique():
    tissueid = tissueids[tissue]
    # The URL needs a '/' between 'obo' and the term id (it was missing).
    # 151 and 4 match the id and attribute_type_fk values of the Dataset tuple.
    print((index, tissue, tissuedescs[tissueid], tissueid, 'http://purl.obolibrary.org/obo/'+tissueid, 151, 4, attributefks[tissue]), end=',\n')
    genesetfks[tissue] = index
    index += 1

### Association

In [None]:
def threshold(z):
    """Return the sign of ``z`` as an int: 1 if positive, -1 if negative, else 0.

    Values that are neither greater nor less than zero (0 itself, NaN) give 0.
    """
    return int(z > 0) - int(z < 0)

In [None]:
# Swap gene and tissue names for their foreign keys and add the sign of z.
associations = gtextissue.assign(
    Gene=gtextissue['Gene'].map(str.upper).map(genefks),
    Tissue=gtextissue['Tissue'].map(genesetfks),
    threshold=gtextissue['z'].map(threshold),
)
associations.columns = ['gene_fk', 'gene_set_fk', 'standardized_value', 'threshold_value']
# Shift the row index by a fixed offset; the index is written as the first CSV column.
associations.index = associations.index + 38000000
associations.to_csv('../../harmonizome-update/gtextissue23.csv')
associations

In [None]:
# Number of distinct genes and distinct tissues in the association table.
gtextissue['Gene'].nunique(dropna=False), gtextissue['Tissue'].nunique(dropna=False)

## Create Downloads

In [None]:
def threshold(z):
    """Map a value to 1 (positive), -1 (negative) or 0 (anything else).

    This re-defines the identical helper declared in the Association section.
    """
    return 1 if z > 0 else (-1 if z < 0 else 0)

In [None]:
output_path = 'downloads/'
# Symbol -> GeneID lookup from the filtered gene_info table.
geneids = geneinfo['GeneID'].to_dict()
# Attach the gene and tissue ids plus the sign of z, then fix the column order.
gtextissue = gtextissue.assign(**{
    'Gene ID': gtextissue['Gene'].map(geneids),
    'Tissue ID': gtextissue['Tissue'].map(tissueids),
    'threshold': gtextissue['z'].map(threshold),
})[['Gene', 'Gene ID', 'Tissue', 'Tissue ID', 'z', 'threshold']]
gtextissue

### Gene Attribute Ternary Matrix

In [None]:
# Gene x tissue matrix of threshold values (1 / -1); pairs with no association become 0.
# `aggfunc` is given as the string 'max': passing the builtin `max` is deprecated
# (pandas 2.1) because a future version will call the builtin directly.
ternarymatrix = pd.crosstab(gtextissue['Gene'], gtextissue['Tissue'], gtextissue['threshold'], aggfunc='max').fillna(0)
ternarymatrixT = ternarymatrix.T
ternarymatrix.to_csv(output_path+'gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
ternarymatrix

### Gene Attribute Edge List

In [None]:
# Same table with download-friendly names for the value columns.
edgelist = gtextissue.rename(columns={'z': 'Standardized Value', 'threshold': 'Threshold Value'})
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Unique (Gene, Gene ID) pairs in order of first appearance.
geneslist = edgelist[['Gene', 'Gene ID']].drop_duplicates().reset_index(drop=True)
geneslist.to_csv(output_path+'gene_list_terms.txt.gz', sep='\t', compression='gzip')
geneslist

### Attribute List

In [None]:
# Unique (Tissue, Tissue ID) pairs in order of first appearance.
attributeslist = edgelist[['Tissue', 'Tissue ID']].drop_duplicates().reset_index(drop=True)
attributeslist.to_csv(output_path+'attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributeslist

### Up Gene Set Library

In [None]:
with open(output_path+'gene_set_library_up_crisp.gmt', 'w') as f:
    arr = ternarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)
    attributes = ternarymatrix.columns

    # One GMT line per tissue column: the tissue name, then every gene whose
    # value is 1. Sets with fewer than 5 genes are not written.
    for i, attribute in enumerate(tqdm(attributes)):
        upgenes = ternarymatrix.index[arr[:, i] == 1]
        if len(upgenes) >= 5:
            print(attribute, *upgenes, sep='\t', end='\n', file=f)

### Down Gene Set Library

In [None]:
with open(output_path+'gene_set_library_dn_crisp.gmt', 'w') as f:
    arr = ternarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)
    attributes = ternarymatrix.columns

    # One GMT line per tissue column: the tissue name, then every gene whose
    # value is -1. Sets with fewer than 5 genes are not written.
    for i, attribute in enumerate(tqdm(attributes)):
        downgenes = ternarymatrix.index[arr[:, i] == -1]
        if len(downgenes) >= 5:
            print(attribute, *downgenes, sep='\t', end='\n', file=f)

### Up Attribute Set Library

In [None]:
with open(output_path+'attribute_set_library_up_crisp.gmt', 'w') as f:
    arr = ternarymatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)
    genes = ternarymatrixT.columns

    # One GMT line per gene column: the gene, then every tissue whose value is
    # 1. Sets with fewer than 5 tissues are not written.
    for i, gene in enumerate(tqdm(genes)):
        uptissues = ternarymatrixT.index[arr[:, i] == 1]
        if len(uptissues) >= 5:
            print(gene, *uptissues, sep='\t', end='\n', file=f)

### Down Attribute Set Library

In [None]:
with open(output_path+'attribute_set_library_dn_crisp.gmt', 'w') as f:
    arr = ternarymatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)
    genes = ternarymatrixT.columns

    # One GMT line per gene column: the gene, then every tissue whose value is
    # -1. Sets with fewer than 5 tissues are not written.
    for i, gene in enumerate(tqdm(genes)):
        downtissues = ternarymatrixT.index[arr[:, i] == -1]
        if len(downtissues) >= 5:
            print(gene, *downtissues, sep='\t', end='\n', file=f)

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between gene rows: 1 minus the cosine distance,
# expanded from the condensed pdist vector to a square matrix.
gene_similarity_matrix = 1 - dist.squareform(
    dist.pdist(ternarymatrix.to_numpy(dtype=np.int_), 'cosine')
)

gene_similarity_matrix = pd.DataFrame(
    data=gene_similarity_matrix,
    index=ternarymatrix.index,
    columns=ternarymatrix.index,
)
# Clear the axis names so the first header cell of the saved file is blank.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between tissue rows: 1 minus the cosine distance,
# expanded from the condensed pdist vector to a square matrix.
attribute_similarity_matrix = 1 - dist.squareform(
    dist.pdist(ternarymatrixT.to_numpy(dtype=np.int_), 'cosine')
)

attribute_similarity_matrix = pd.DataFrame(
    data=attribute_similarity_matrix,
    index=ternarymatrixT.index,
    columns=ternarymatrixT.index,
)
# Clear the axis names so the first header cell of the saved file is blank.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Gene Attribute Standardized Matrix

In [None]:
# Gene x tissue matrix of z-scores; pairs with no association become 0.
# `aggfunc` is given as the string 'max': passing the builtin `max` is deprecated
# (pandas 2.1) because a future version will call the builtin directly.
standardizedmatrix = pd.crosstab(gtextissue['Gene'], gtextissue['Tissue'], gtextissue['z'], aggfunc='max').fillna(0)
standardizedmatrix.to_csv(output_path+'gene_attribute_matrix_standardized.txt.gz', sep='\t', compression='gzip')
standardizedmatrix

### Knowledge Graph Serialization

In [None]:
# Build the knowledge-graph payload: `nodes` maps node id -> node record and
# `edges` is a list with one record per gene-tissue association.
nodes = {}
edges = []

# Rows are read as plain dicts via `to_dict('records')` rather than fetched one
# at a time with `.loc`, which is slow for the full edge list.

# Gene nodes are keyed by the integer Gene ID.
for gene in geneslist.to_dict('records'):
    geneid = int(gene['Gene ID'])
    nodes[geneid] = {
        "type":"gene",
        "properties": {
            "id":geneid,
            "label":gene['Gene']
        }}

# Tissue nodes are keyed by the tissue id string.
for tissue in attributeslist.to_dict('records'):
    nodes[tissue['Tissue ID']] = {
        "type":"tissue",
        "properties": {
            "label":tissue['Tissue'],
            "id":tissue['Tissue ID']
        }}

# A threshold of 1 is an over-expression edge; any other value is recorded as
# under-expression with threshold -1.
for edge in edgelist.to_dict('records'):
    overexpressed = edge['Threshold Value']==1
    geneid = int(edge['Gene ID'])
    edges.append({
        "source": geneid,
        "relation": "over-expressed in" if overexpressed else "under-expressed in",
        "target": edge['Tissue ID'],
        "properties":{
            "id":str(edge['Gene ID'])+":"+edge['Tissue ID'],
            "source_id":geneid,
            "source_label":edge['Gene'],
            "target_id":edge['Tissue ID'],
            "target_label":edge['Tissue'],
            "directed":True,
            "standardized_value":edge['Standardized Value'],
            "threshold":1 if overexpressed else -1
        }})

In [None]:
# Displays whatever `edge` was last bound to (the final row visited by the edge loop).
edge

#### RDF

In [None]:
# Write one "gene:<id> <predicate> <ontology>:<term> ." line per edge, preceded
# by the prefix declarations.
# NOTE(review): the prefix lines are not strict Turtle (no <...> around the IRI,
# no scheme, no trailing '.'); confirm what the consumer of this file expects.
with open(output_path+'kg_serializations/gtextissue23.rdf', 'w') as f:
    print('@prefix gene: ncbi.nlm.nih.gov/gene/', file=f)
    # The RO prefix now includes '/obo/' like the UBERON and EFO prefixes below.
    print('@prefix RO: purl.obolibrary.org/obo/RO_', file=f)
    print('@prefix UBERON: purl.obolibrary.org/obo/UBERON_', file=f)
    print('@prefix EFO: purl.obolibrary.org/obo/EFO_', file=f)

    print('', file=f)
    for edge in edges:
        # RO:0002245 for threshold 1 (over-expressed), RO:0002246 otherwise.
        # The trailing space after 'RO:0002246' is removed: it put a double
        # space in the down lines only.
        predicate = 'RO:0002245' if edge["properties"]["threshold"]==1 else 'RO:0002246'
        # 'UBERON_0001' -> 'UBERON:0001' so the term uses the declared prefix.
        print('gene:'+str(edge['properties']['source_id']), predicate, edge['properties']['target_id'].replace('_', ':'), end=' .\n', file=f)

#### JSON

In [None]:
# Serialize the node and edge records to JSON. `json.dump` writes to the file
# and returns None, so its result is not bound to a name.
with open(output_path+'kg_serializations/gtextissue23.json', 'w') as f:
    json.dump(
        {
            "Version":"1",
            "nodes": nodes,
            "edges": edges
        }, f, indent=4)

#### TSV

In [None]:
# Displays the full node dictionary (one entry per gene and per tissue).
nodes

In [None]:
def namespace(nodeid):
    """Return the identifier namespace for a node id.

    The id is converted to text; 'UBERON' is returned if that text contains
    'UBERON', otherwise 'EFO' if it contains 'EFO', otherwise 'NCBI Entrez'.
    """
    text = str(nodeid)
    for prefix in ('UBERON', 'EFO'):
        if prefix in text:
            return prefix
    return 'NCBI Entrez'

In [None]:
# One row per node, then pull the id and label out of each properties dict.
nodeframe = pd.DataFrame(nodes).T
nodeframe = nodeframe.assign(
    id=nodeframe['properties'].map(lambda props: props['id']),
    label=nodeframe['properties'].map(lambda props: props['label']),
)
nodeframe['namespace'] = nodeframe['id'].map(namespace)
nodeframe = nodeframe[['namespace', 'id', 'label']].reset_index(drop=True)
nodeframe.to_csv(output_path+'kg_serializations/gtextissue23_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Flatten the edge records to the four exported columns.
# NOTE(review): the standardized value is not part of the exported columns —
# confirm whether edges.tsv should carry it.
edgeframe = pd.DataFrame(edges)
edgeframe = edgeframe.assign(
    threshold=edgeframe['properties'].map(lambda props: props['threshold'])
)[['source', 'relation', 'target', 'threshold']]
edgeframe.to_csv(output_path+'kg_serializations/gtextissue23_tsv/edges.tsv', sep='\t')
edgeframe

## Create Visualizations

In [None]:
# Reload the saved ternary matrix so the plots can be made without re-running the processing cells.
ternarymatrix = pd.read_csv(
    'downloads/gene_attribute_matrix.txt.gz',
    sep='\t',
    compression='gzip',
    index_col='Gene',
)
ternarymatrix

### Gene Attribute Clustered Heatmap

In [None]:
# Diverging colour map centred on 0.
sns.clustermap(
    ternarymatrix,
    cmap='seismic',
    center=0,
    figsize=(25,25),
)

### Gene Similarity Clustered Heatmap

In [None]:
# Reload the saved gene similarity matrix; its unnamed first column is the index.
gene_similarity_matrix = pd.read_csv(
    'downloads/gene_similarity_matrix_cosine.txt.gz',
    sep='\t',
    compression='gzip',
    index_col='Unnamed: 0',
)

In [None]:
# Clustered heatmap of gene-gene cosine similarity.
sns.clustermap(
    gene_similarity_matrix,
    cmap='seismic',
    center=0,
)

### Attribute Similarity Clustered Heatmap

In [None]:
# Reload the saved tissue similarity matrix; its unnamed first column is the index.
attribute_similarity_matrix = pd.read_csv(
    'downloads/attribute_similarity_matrix_cosine.txt.gz',
    sep='\t',
    compression='gzip',
    index_col='Unnamed: 0',
)

In [None]:
# Clustered heatmap of tissue-tissue cosine similarity.
sns.clustermap(
    attribute_similarity_matrix,
    cmap='seismic',
    center=0,
)

### UMAP

In [None]:
def load_gmt(file, suffix='_down'):
    """Read a GMT file into an ordered mapping of term -> space-joined genes.

    Parameters
    ----------
    file : iterable of str
        Lines of a GMT file: a term followed by its genes, tab-separated.
    suffix : str, optional
        Text appended to every term so that up and down sets of the same term
        get distinct keys. The default '_down' matches the definition this
        cell previously left bound to the name.

    Returns
    -------
    collections.OrderedDict
        ``term + suffix`` mapped to the term's unique genes joined by single
        spaces, in order of first appearance. Blank lines are skipped.
    """
    gmt = OrderedDict()
    for line in file:
        term, *geneset = line.strip().split('\t')
        if not term:
            # A blank line would otherwise add an empty term.
            continue
        # dict.fromkeys removes duplicates while keeping a reproducible order
        # (set iteration order for strings varies between runs).
        gmt[term+suffix] = ' '.join(dict.fromkeys(geneset))
    return gmt

# Context managers close both library files once they are read.
with open('downloads/gene_set_library_up_crisp.gmt', 'r') as gmtfile:
    libdict = load_gmt(gmtfile, suffix='_up')
with open('downloads/gene_set_library_dn_crisp.gmt', 'r') as gmtfile:
    downlibdict = load_gmt(gmtfile, suffix='_down')
libdict.update(downlibdict)
scatterdir = 'images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed gene sets with TF-IDF + UMAP and label them with Leiden clusters.

    Parameters
    ----------
    libdict : mapping of str -> str
        Gene set name mapped to its genes as one whitespace-separated string.
    nneighbors : int
        ``n_neighbors`` passed to ``sc.pp.neighbors``.
    mindist, spread : float
        ``min_dist`` and ``spread`` passed to ``sc.tl.umap``.
    maxdf, mindf : float or int
        ``max_df`` and ``min_df`` passed to ``TfidfVectorizer``.

    Returns
    -------
    pandas.DataFrame
        One row per gene set, ordered by cluster, with columns ``x``, ``y``
        (UMAP coordinates), ``cluster``, ``term`` and ``genes``.
    """
    print("\tTF-IDF vectorizing gene set data...")
    # NOTE(review): the vectorizer runs with its default tokenizer settings;
    # confirm these keep hyphenated or one-character gene symbols intact.
    vec = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    X = vec.fit_transform(libdict.values())
    print(X.shape)
    adata = anndata.AnnData(X)
    adata.obs.index = list(libdict.keys())

    print("\tPerforming Leiden clustering...")
    ### the n_neighbors and min_dist parameters can be altered
    sc.pp.neighbors(adata, n_neighbors=nneighbors)
    sc.tl.leiden(adata, resolution=1.0)
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    new_order = adata.obs.sort_values(by='leiden').index.tolist()
    # Subsetting returns a view; copy it so the assignment to `.obs` below
    # writes to a real object instead of triggering an implicit copy.
    adata = adata[new_order, :].copy()
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    df = pd.DataFrame(adata.obsm['X_umap'])
    df.columns = ['x', 'y']

    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    df['genes'] = [libdict[l] for l in df['term']]

    return df

In [None]:
def get_scatter_colors(df):
    """Assign a Category20 colour to each distinct value of ``df['cluster']``.

    Clusters are taken in order of first appearance. The palette is reordered
    to its even-indexed entries followed by its odd-indexed ones, and wraps
    around after 20 clusters.
    """
    palette = Category20[20]
    colors = [*palette[::2], *palette[1::2]]
    clusters = pd.unique(df['cluster']).tolist()
    return {cluster: colors[position % 20] for position, cluster in enumerate(clusters)}

def get_scatterplot(scatterdf):
    """Build the Bokeh scatter plot of gene sets coloured by cluster.

    Parameters
    ----------
    scatterdf : pandas.DataFrame
        Frame with ``x``, ``y``, ``cluster`` and ``term`` columns, as returned
        by ``process_scatterplot``. It is copied, not modified.

    Returns
    -------
    bokeh figure
        Scatter plot with a hover tooltip showing the gene set name, its
        coordinates and its cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].apply(lambda x: color_mapper[x])

    # Each tooltip row is its own closed <div>; two of them were left unclosed
    # and therefore nested inside one another.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide the tick marks and tick labels on both axes
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = 'Gene Sets in the GTEx Tissue Gene Expression Profiles 2023 Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    # One marker per gene set, coloured by the per-row 'colors' column.
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
## defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
# maxdf is left at its default here.
scatter_df = process_scatterplot(
    libdict,
    nneighbors=8,
    mindist=0.05,
    spread=2,
    mindf=2,
)

# Display Scatter Plot
plot = get_scatterplot(scatter_df)
show(plot)

In [None]:
# NOTE(review): the file name is spelled 'gtexatissue23' while the dataset
# directory elsewhere in this notebook is 'gtextissue23' — confirm which is intended.
# `scatterdir` already ends with '/', so the resulting path contains '//'.
output_file(
    filename=f"{scatterdir}/gtexatissue23.html",
    title='Gene Sets in the GTEx Tissue Gene Expression Profiles 2023 Library',
)
save(plot)