# GTEx eQTL 2025


In [None]:
import pandas as pd
import datetime
import goenrich
import os
import numpy as np
import scipy.spatial.distance as dist
import seaborn as sns
import sys
from tqdm import tqdm
import json
import scanpy as sc

from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
# Configure Bokeh to render its output inline in the notebook.
output_notebook()

from IPython.display import display, HTML, Markdown
# Raise Python's recursion limit well above the default.
# NOTE(review): presumably required by the hierarchical clustering behind the
# seaborn clustermaps at the end of the notebook -- confirm.
sys.setrecursionlimit(100000)

## Load and Pre-Process Data

In [None]:
# Directory containing one gzipped, tab-separated eQTL table per tissue.
eqtl_dir = 'GTEx_Analysis_v10_eQTL_updated'

# Declared column layout of the combined table. It fixes the leading column
# order of the result and is the schema used when no table is found.
eqtl_columns = ['gene_id', 'gene_name', 'biotype', 'gene_chr', 'gene_start', 'gene_end',
       'strand', 'num_var', 'beta_shape1', 'beta_shape2', 'true_df',
       'pval_true_df', 'variant_id', 'tss_distance', 'chr', 'variant_pos',
       'ref', 'alt', 'num_alt_per_site', 'rs_id_dbSNP155_GRCh38p13',
       'ma_samples', 'ma_count', 'af', 'pval_nominal', 'slope', 'slope_se',
       'pval_perm', 'pval_beta', 'qval', 'pval_nominal_threshold', 'afc',
       'afc_se']

# Read every directory entry whose name does not contain 'parquet'. The frames
# are collected first and concatenated once: concatenating inside the loop
# re-copies the growing table on every iteration and starts from an empty
# all-object frame, which can degrade the numeric dtypes of the result.
tissue_frames = [
    pd.read_csv(f'{eqtl_dir}/{tissue}', sep='\t', compression='gzip')
    for tissue in tqdm(os.listdir(eqtl_dir))
    if 'parquet' not in tissue
]

if tissue_frames:
    eqtl = pd.concat(tissue_frames, axis=0)
    # Declared columns first (absent ones become NaN), then any extras.
    extra_columns = [column for column in eqtl.columns if column not in eqtl_columns]
    eqtl = eqtl.reindex(columns=eqtl_columns + extra_columns)
else:
    eqtl = pd.DataFrame(columns=eqtl_columns)

eqtl

In [None]:
# Keep only the identifier and significance columns, most significant first.
kept_columns = ['gene_id', 'gene_name', 'biotype', 'variant_id', 'rs_id_dbSNP155_GRCh38p13', 'qval']
eqtl = eqtl.loc[:, kept_columns].sort_values(by='qval')

# Upper-case the symbols so they compare equal to the upper-cased reference
# symbols built further down.
eqtl['gene_name'] = eqtl['gene_name'].str.upper()

# Restrict to rows whose biotype is protein_coding.
is_protein_coding = eqtl['biotype'] == 'protein_coding'
eqtl = eqtl[is_protein_coding]
eqtl

In [None]:
# Load the gene_info table and keep the protein-coding rows for taxon 9606.
gene_info = pd.read_csv('../../tables/Homo_sapiens.gene_info.gz', sep='\t', compression='gzip')
is_taxon_9606 = gene_info['#tax_id'] == 9606
is_coding = gene_info['type_of_gene'] == 'protein-coding'
gene_info = gene_info[is_taxon_9606 & is_coding]

# Symbols are compared case-insensitively throughout, so store them upper-cased.
gene_info['Symbol'] = gene_info['Symbol'].str.upper()
symbols = set(gene_info['Symbol'])
gene_info

In [None]:
# Build an upper-cased alias -> official symbol lookup from gene_info.
synonyms = gene_info.copy()

# One row per alias: the 'Synonyms' field is a '|'-separated list.
synonyms['Synonyms'] = synonyms['Synonyms'].str.upper().str.split('|')
synonyms = synonyms.explode('Synonyms')

# '-' is the placeholder used when a gene has no aliases; skip it (and any
# missing values) so it never becomes a lookup key.
has_alias = synonyms['Synonyms'].notna() & (synonyms['Synonyms'] != '-')
synonyms = synonyms[has_alias].set_index('Synonyms')['Symbol'].to_dict()

# Official symbols are applied last so they always map to themselves. When
# they were mixed in with the aliases, a symbol that is also an alias of a
# later gene was silently remapped to that other gene (last dict entry wins).
synonyms.update({symbol: symbol for symbol in gene_info['Symbol'].dropna()})

In [None]:
# Translate every gene name to its official symbol; unknown names become NaN.
eqtl['gene_name'] = eqtl['gene_name'].map(synonyms)

# Drop incomplete rows, then average qval over rows that share the same
# (gene, variant, rsID) key, ordered from smallest to largest mean qval.
pair_key = ['gene_name', 'variant_id', 'rs_id_dbSNP155_GRCh38p13']
mean_qval = eqtl.dropna().groupby(pair_key)['qval'].mean()
eqtl = mean_qval.sort_values().reset_index()

# Score is -log10(qval) scaled down by 10.
eqtl['Score'] = eqtl['qval'].map(lambda qval: -np.log10(qval)/10)

# Associations below the 1e-10 cutoff are the ones carried forward.
filt = eqtl[eqtl['qval']<1e-10]
eqtl

In [None]:
# Show the filtered associations, then the number of distinct values per column.
display(filt)
filt.nunique()

In [None]:
# Report how many genes, SNPs and associations survive each cutoff 1e-2 .. 1e-10.
for exponent in range(2, 11):
    cutoff = 1*10**-exponent
    below_cutoff = eqtl[eqtl['qval'] < cutoff]
    print(
        f'q<{cutoff}',
        below_cutoff['gene_name'].nunique(), 'genes,',
        below_cutoff['variant_id'].nunique(), 'SNPs,',
        len(below_cutoff), 'associations',
    )

In [None]:
# Lookup from variant id to its rs identifier (a later duplicate key overrides an earlier one).
snp_pairs = eqtl[['variant_id', 'rs_id_dbSNP155_GRCh38p13']].drop_duplicates()
rs_dict = dict(zip(snp_pairs['variant_id'], snp_pairs['rs_id_dbSNP155_GRCh38p13']))

In [None]:
# `error` is not defined anywhere in this notebook, so evaluating it raises
# NameError and execution stops here.
# NOTE(review): presumably a deliberate guard so that "Run All" does not
# continue into the ingestion and download cells -- confirm.
error

## Process Data for SQL Ingestion

### Dataset

In [None]:
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, is_signed, is_continuous_valued, last_updated, directory, num_page_views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural)
# Dataset row, one value per column listed above. gene_sets_description now
# uses the dataset name "GTEx eQTL 2025" like every other field (it previously
# read "GTEx 2025 eQTL").
(
    175,  # id
    'GTEx eQTL 2025',  # name
    'eQTL 2025',  # name_without_resource
    'Significance values for all gene-SNP pairs testing likelihood that SNP affects gene expression',  # description
    'gene-SNP associations by likelihood that SNP regulates gene',  # association
    'genes with expression regulated by the {0} SNP from the GTEx eQTL 2025 dataset.',  # gene_set_description
    'sets of genes with expression regulated by SNPs from the GTEx eQTL 2025 dataset.',  # gene_sets_description
    'SNPs regulating expression of {0} gene from the GTEx eQTL 2025 dataset.',  # attribute_set_description
    0,  # is_signed
    1,  # is_continuous_valued
    '2025-04-22',  # last_updated
    'gtexeqtl25',  # directory
    7510,  # num_page_views
    29,  # resource_fk
    11,  # measurement_fk
    2,  # dataset_group_fk
    44,  # attribute_type_fk
    8,  # attribute_group_fk
    'expression quantitative trait loci by single nucleotide polymorphism microarray and RNA-seq',  # evidence_type
    'primary experimental data',  # evidence_group
    'high throughput, data driven',  # measurement_bias
    'SNPs',  # attribute_type_plural
)

### Publication

In [None]:
# Publication link row. 175 matches the dataset id used in the Dataset cell.
# NOTE(review): presumably (id, dataset_fk, publication_fk) -- confirm against the schema.
(263, 175, 150)

### Gene

In [None]:
# Existing gene table: upper-cased symbol -> primary key.
genes = pd.read_csv('../../tables/gene.csv')
genes['symbol'] = genes['symbol'].str.upper()
genelist = set(genes['symbol'])
genefks = genes.set_index('symbol')['id'].to_dict()

# One reference row per symbol. Deduplication has to happen on the symbol
# (the lookup key): Series.drop_duplicates() dedupes on the *values*, which
# removed every gene whose description matched an earlier gene's (KeyError
# below) while leaving duplicate symbols in place (lookups then return a
# Series instead of a scalar, and Series.map rejects the non-unique index).
unique_gene_info = gene_info.drop_duplicates(subset='Symbol').set_index('Symbol')
geneids = unique_gene_info['GeneID']
genedescs = unique_gene_info['description']

# First primary key to hand out to genes missing from the gene table.
index = 58842
geneurl = 'https://ncbi.nlm.nih.gov/gene/'

# Print one row tuple per gene that is missing from the gene table and record
# the key assigned to it so later cells can resolve it through genefks.
for gene in filt['gene_name'].unique():
    if gene not in genelist:
        geneid = geneids[gene]
        print((index, gene, geneid, genedescs[gene], geneurl+str(geneid)), end=',\n')
        genefks[gene] = index
        index += 1

### Attribute

In [None]:
# First primary key to hand out to SNPs missing from the attribute table.
index = 466782

# Existing attributes, keyed by lower-cased name.
attributes = pd.read_csv('../../tables/attribute.tsv', sep='\t')
attributes['name_from_naming_authority']= attributes['name_from_naming_authority'].str.lower()
attributefks = attributes.set_index('name_from_naming_authority')['id'].to_dict()
dbsnpurl = 'https://www.ncbi.nlm.nih.gov/snp/'

# Print one row tuple per SNP that has no attribute yet and remember its key.
for snp in filt['variant_id'].unique():
    snp_key = snp.lower()
    if snp_key in attributefks:
        continue
    rs_id = rs_dict[snp]
    # The chromosome is the first '_'-separated token with its 'chr' prefix removed.
    chromosome = snp.split("_")[0].replace("chr","")
    print((index, snp, rs_id, f'SNP on Chromosome {chromosome}', dbsnpurl+rs_id, 53), end=',\n')
    attributefks[snp_key] = index
    index += 1

### Gene Set

In [None]:
# First primary key for the gene sets; one gene set is emitted per SNP.
index = 137600000
genesetfks = {}
dbsnpurl = 'https://www.ncbi.nlm.nih.gov/snp/'

for snp in filt['variant_id'].unique():
    rs_id = rs_dict[snp]
    # The chromosome is the first '_'-separated token with its 'chr' prefix removed.
    chromosome = snp.split("_")[0].replace("chr","")
    attribute_fk = attributefks[snp.lower()]
    print((index, snp, rs_id, f'SNP on Chromosome {chromosome}', dbsnpurl+rs_id, 175, 44, attribute_fk), end=',\n')
    genesetfks[snp] = index
    index += 1

### Association

In [None]:
# Association table: gene and gene-set foreign keys plus the score, one row
# per filtered gene-SNP pair.
associations = pd.DataFrame({
    'gene_fk': filt['gene_name'].map(genefks),
    'gene_set_fk': filt['variant_id'].map(genesetfks),
    'standardized_value': filt['Score'],
    'threshold_value': 1,
})

# Shift the row labels by a fixed offset; they are written out as the first
# CSV column.
associations.index = associations.index + 66000000
associations.to_csv('../../harmonizome-update/gtexeqtl25.csv')
associations

## Create Downloads

In [None]:
# Root folder for every download file written below.
output_path = 'downloads/'

# Create the folder tree the later cells write into (the knowledge-graph
# serializations go to kg_serializations/ and kg_serializations/gtexeqtl25_tsv/).
# Without it the first write fails on a fresh checkout.
os.makedirs(output_path+'kg_serializations/gtexeqtl25_tsv', exist_ok=True)

### Binary Matrix

In [None]:
# Gene x SNP count table of the filtered associations, plus its transpose
# (SNP x gene) for the attribute-centric outputs further down.
binarymatrix = pd.crosstab(index=filt['gene_name'], columns=filt['variant_id'])
binarymatrixT = binarymatrix.transpose()
binarymatrix.to_csv(
    output_path+'gene_attribute_matrix.txt.gz',
    sep='\t',
    compression='gzip',
)
binarymatrix

### Gene-Attribute Edge List

In [None]:
# Flat edge list: one row per filtered gene-SNP association.
edgelist = filt.reset_index(drop=True)
edgelist['Gene ID'] = edgelist['gene_name'].map(geneids).astype(int)

# Select the output columns in order and give them their download headers.
edgelist = edgelist[['gene_name', 'Gene ID', 'variant_id', 'rs_id_dbSNP155_GRCh38p13', 'Score']].rename(
    columns={
        'gene_name': 'Gene',
        'variant_id': 'SNP',
        'rs_id_dbSNP155_GRCh38p13': 'SNP ID',
        'Score': 'Standardized Value',
    }
)
edgelist['Threshold Value'] = 1
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Distinct (symbol, Gene ID) pairs from the edge list, renumbered from 0.
gene_columns = ['Gene', 'Gene ID']
geneslist = edgelist.loc[:, gene_columns].drop_duplicates(ignore_index=True)
geneslist.to_csv(output_path+'gene_list_terms.txt.gz', sep='\t', compression='gzip')
geneslist

### Attribute List

In [None]:
# Distinct (variant id, rs id) pairs from the edge list, renumbered from 0.
snp_columns = ['SNP', 'SNP ID']
attributeslist = edgelist.loc[:, snp_columns].drop_duplicates(ignore_index=True)
attributeslist.to_csv(output_path+'attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributeslist

### Gene Set Library

In [None]:
# One GMT line per SNP: SNP id, an empty description field, then the genes
# whose matrix entry equals 1. SNPs with fewer than 5 such genes are skipped.
with open(output_path+'gene_set_library_crisp.gmt', 'w') as f:
    arr = binarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)
    attributes = binarymatrix.columns

    w, h = arr.shape
    for i in tqdm(range(h)):
        members = binarymatrix.index[arr[:, i] == 1]
        if len(members) < 5:
            continue
        fields = [str(attributes[i]), '', *map(str, members)]
        f.write('\t'.join(fields) + '\n')

### Attribute Set Library

In [None]:
# One GMT line per gene: symbol, an empty description field, then the SNPs
# whose matrix entry equals 1. Genes with fewer than 5 such SNPs are skipped.
with open(output_path+'attribute_set_library_crisp.gmt', 'w') as f:
    arr = binarymatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)
    genes = binarymatrixT.columns

    w, h = arr.shape
    for i in tqdm(range(h)):
        members = binarymatrixT.index[arr[:, i] == 1]
        if len(members) < 5:
            continue
        fields = [str(genes[i]), '', *map(str, members)]
        f.write('\t'.join(fields) + '\n')

In [None]:
# Reload the matrix from the file written above (first column is the index)
# and name the row axis 'Gene'.
binarymatrix = pd.read_csv(
    'downloads/gene_attribute_matrix.txt.gz',
    sep='\t',
    compression='gzip',
    index_col=0,
)
binarymatrix = binarymatrix.rename_axis('Gene')

### Gene-Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between gene rows: pdist returns condensed
# cosine distances, squareform expands them to a square matrix, and
# 1 - distance turns them into similarities.
gene_vectors = binarymatrix.to_numpy(dtype=np.int_)
gene_distances = dist.squareform(dist.pdist(gene_vectors, 'cosine'))

gene_similarity_matrix = pd.DataFrame(
    data=1 - gene_distances,
    index=binarymatrix.index,
    columns=binarymatrix.index,
)
# Clear the axis names so the written header carries no axis label.
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute-Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between SNP rows of the transposed matrix:
# condensed cosine distances -> square matrix -> 1 - distance.
snp_vectors = binarymatrixT.to_numpy(dtype=np.int_)
snp_distances = dist.squareform(dist.pdist(snp_vectors, 'cosine'))

attribute_similarity_matrix = pd.DataFrame(
    data=1 - snp_distances,
    index=binarymatrixT.index,
    columns=binarymatrixT.index,
)
# Clear the axis names so the written header carries no axis label.
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Standardized Matrix

In [None]:
# Preview: every eqtl row whose gene and whose variant each occur somewhere in
# filt. The pair itself need not be in filt, so rows at or above the 1e-10
# cutoff can appear here.
eqtl[(eqtl['gene_name'].isin(filt['gene_name'])) & (eqtl['variant_id'].isin(filt['variant_id']))]

In [None]:
# Keep the rows whose gene and whose variant both occur in the filtered set.
gene_in_filt = eqtl['gene_name'].isin(filt['gene_name'])
snp_in_filt = eqtl['variant_id'].isin(filt['variant_id'])
eqtl = eqtl[gene_in_filt & snp_in_filt]

# Gene x SNP table of mean Score; pairs without an association become 0.
standardizedmatrix = pd.crosstab(
    index=eqtl['gene_name'],
    columns=eqtl['variant_id'],
    values=eqtl['Score'],
    aggfunc='mean',
)
standardizedmatrix = standardizedmatrix.fillna(0)
standardizedmatrix.to_csv(output_path + 'gene_attribute_matrix_standardized.txt.gz', sep='\t', compression='gzip')
standardizedmatrix

### Knowledge Graph Serialization

In [None]:
# Knowledge-graph containers: `nodes` maps a node key (integer Gene ID or SNP
# id string) to its record; `edges` lists one record per gene-SNP association.
nodes = {}
edges = []

# Each loop walks the needed columns in a single pass. The earlier per-row
# `.loc` lookup built a full row Series for every record and reused the loop
# variable for both the row label and the row itself.

# Gene nodes, keyed by the integer Gene ID.
for gene_id, gene_label in zip(geneslist['Gene ID'], geneslist['Gene']):
    gene_id = int(gene_id)
    nodes[gene_id] = {
        "type":"gene",
        "properties": {
            "id":gene_id,
            "label":gene_label
        }}

# SNP nodes, keyed by the variant id and labelled with the rs id.
for snp_id, snp_label in zip(attributeslist['SNP'], attributeslist['SNP ID']):
    nodes[snp_id] = {
        "type":"SNP",
        "properties": {
            "id":snp_id,
            "label":snp_label
        }}

# One directed gene -> SNP edge per row of the edge list.
edge_columns = zip(
    edgelist['Gene ID'],
    edgelist['Gene'],
    edgelist['SNP'],
    edgelist['SNP ID'],
    edgelist['Standardized Value'],
)
for gene_id, gene_label, snp_id, snp_label, score in edge_columns:
    gene_id = int(gene_id)
    edges.append({
        "source": gene_id,
        "relation": "has SNP",
        "target": snp_id,
        "properties":{
            "id":str(gene_id)+":"+snp_id,
            "source_id":gene_id,
            "source_label":gene_label,
            "target_label":snp_label,
            "target_id":snp_id,
            "directed":True,
            "standardized":score,
            "threshold":1
        }})

#### RDF

In [None]:
# Write the edges as one triple per line, preceded by the prefix declarations.
with open(output_path+'kg_serializations/gtexeqtl25.rdf', 'w') as f:
    print('@prefix gene: <https://www.ncbi.nlm.nih.gov/gene/> .', file=f)
    print('@prefix RO: <http://purl.obolibrary.org/RO_> .', file=f)
    print('@prefix GTExeQTL: <https://www.gtexportal.org/home/snp/>.', file=f)
    print('', file=f)
    for edge in edges:
        subject = 'gene:'+str(edge['properties']['source_id'])
        # Qualify the SNP with the GTExeQTL prefix declared above. It was
        # previously written as a bare token, which leaves the declared prefix
        # unused and is not a resolvable term.
        target = 'GTExeQTL:'+str(edge['properties']['target_id'])
        # NOTE(review): '<has SNP>' contains a space, which is not allowed
        # inside an IRI reference -- confirm the intended predicate.
        print(subject, '<has SNP>', target, end=' .\n', file=f)

#### JSON

In [None]:
# Serialize the graph (version tag, nodes, edges) as indented JSON.
# json.dump writes to the file and returns None, so its result is not bound
# to a name (the previous `serial` variable only ever held None).
with open(output_path+'kg_serializations/gtexeqtl25.json', 'w') as f:
    graph = {
        "Version":"1",
        "nodes": nodes,
        "edges": edges
    }
    json.dump(graph, indent=4, fp=f)

#### TSV

In [None]:
# Flatten the node records into a namespace / id / label table.
namespace_by_type = {'gene':'NCBI Entrez', 'SNP':'GTEx'}

nodeframe = pd.DataFrame(nodes).T
nodeframe['id'] = [properties['id'] for properties in nodeframe['properties']]
nodeframe['label'] = [properties['label'] for properties in nodeframe['properties']]
nodeframe['namespace'] = [namespace_by_type[node_type] for node_type in nodeframe['type']]

# Keep the three output columns and replace the node keys with a 0..n-1 index.
nodeframe = nodeframe[['namespace', 'id', 'label']].reset_index(drop=True)
nodeframe.to_csv(output_path+'kg_serializations/gtexeqtl25_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Flatten the edge records: lift the two numeric properties into columns.
edgeframe = pd.DataFrame(edges)
edgeframe['standardized'] = [properties['standardized'] for properties in edgeframe['properties']]
edgeframe['threshold'] = [properties['threshold'] for properties in edgeframe['properties']]

edgeframe = edgeframe[['source', 'relation', 'target', 'standardized', 'threshold']]
edgeframe.to_csv(output_path+'kg_serializations/gtexeqtl25_tsv/edges.tsv', sep='\t')
edgeframe

### Gene Attribute Heat Map

In [None]:
# Clustered heat map of the gene x SNP matrix, colour scale centred on 0.
sns.clustermap(binarymatrix, cmap='seismic', center=0)

### Gene Similarity Clustered Heatmap

In [None]:
# Clustered heat map of the gene-gene cosine similarities, colour scale centred on 0.
sns.clustermap(gene_similarity_matrix, cmap='seismic', center=0)

### Attribute Similarity Clustered Heatmap

In [None]:
# Heat map of the SNP-SNP cosine similarities, colour scale centred on 0.
# NOTE(review): this calls sns.heatmap (no clustering) although the section is
# titled "Clustered Heatmap" and the two plots above use sns.clustermap --
# confirm whether skipping the clustering here is intentional.
sns.heatmap(attribute_similarity_matrix, cmap='seismic', center=0)