# GTEx Tissue Croissant Builder 🥐

Author: Ido Diamant, Ma'ayan Lab, CFDE DRC

In [1]:
import datetime
import hashlib
import json
import mlcroissant as mlc
import os
import pandas as pd
import requests
from zipfile import ZipFile

## Construct Croissant from CFDE Attribute Table

In [2]:
# Human-readable dataset title and summary placed in the Croissant metadata.
name = "GTEx Tissue Gene Expression Profiles"
description = (
    "Gene expression profiles for tissues from GTEx by RNA-seq, updated for 2023"
)

In [3]:
# Bibliographic details of the publication cited for this dataset.
short_citation = 'GTEx Consortium, Science, 2020'
title = 'The GTEx Consortium atlas of genetic regulatory effects across human tissues'
author = 'GTEx Consortium'
journal = 'Science'
year = 2020
volume = 369
pages = '1318-1330'

# BibTeX-style entry assembled from the fields above; doubled braces in the
# f-string emit literal braces around each value.
# NOTE(review): short_citation is used as the entry key and contains spaces and
# commas -- confirm that downstream consumers of cite_as accept such a key.
cite_as = (
    f'@article{{{short_citation}, title={{{title}}}, author={{{author}}}, '
    f'journal={{{journal}}}, year={{{year}}}, volume={{{volume}}}, pages={{{pages}}}}}'
)

In [4]:
# Provenance: the organization that produced the data and the one that
# publishes this packaged form of it.
gtex_project = mlc.Organization(
    name="The Genotype-Tissue Expression Project",
    url="https://gtexportal.org/home/",
)
maayan_lab = mlc.Organization(
    name="Ma'ayan Lab",
    url="https://maayanlab.cloud/",
)
creators = [gtex_project]
publishers = [maayan_lab]

# Licensing, versioning, landing page and publication date of the dataset.
license = "https://gtexportal.org/home/license"
version = "0.1.0"
url = "https://maayanlab.cloud/Harmonizome/dataset/GTEx+Tissue+Gene+Expression+Profiles+2023"
date_published = datetime.date(2023, 11, 1)

### Resource

In [5]:
# Utility function to generate a SHA256 checksum for a FileObject from a URL
def get_sha256(url, timeout=60):
    """Return the hex SHA-256 digest of the content served at ``url``.

    The body is streamed in 64 KiB chunks so the whole file never has to be
    held in memory at once.

    Args:
        url: Address of the file to download and hash.
        timeout: Timeout in seconds handed to ``requests.get`` so a stalled
            server cannot hang the call indefinitely.

    Returns:
        The hexadecimal SHA-256 digest of the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status; this
            prevents hashing an error page instead of the file.
        requests.RequestException: For other request failures such as a
            timeout or connection error.
    """
    sha256 = hashlib.sha256()
    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        for chunk in response.iter_content(chunk_size=65536):
            sha256.update(chunk)
    return sha256.hexdigest()

In [6]:
file_url = 'https://maayanlab.cloud/static/hdfs/harmonizome/data/gtextissue23/gene_attribute_matrix_standardized.txt.gz'

# Two-layer distribution: the downloadable gzip archive, and the tab-separated
# table declared as contained in that archive.
distribution = [
    mlc.FileObject(
        id="dataset-attribute-table-archive",
        name="dataset-attribute-table-archive",
        description="Dataset attribute table archive from Harmonizome.",
        content_url=file_url,
        encoding_formats="application/gzip",
        # Checksum is computed from the live URL when this cell runs.
        sha256=get_sha256(file_url),
    ),
    mlc.FileObject(
        id="dataset-attribute-table",
        name="dataset-attribute-table",
        description="Dataset attribute table from Harmonizome.",
        # NOTE(review): the archive URL ends in
        # gene_attribute_matrix_standardized.txt.gz while this names
        # gene_attribute_matrix.txt -- confirm the intended file name.
        content_url="gene_attribute_matrix.txt",
        encoding_formats="text/tab-separated-values",
        # A list (not a set literal) of parent FileObject ids.
        contained_in=["dataset-attribute-table-archive"],
    ),
]

### Structure/Semantics

In [7]:
# Load the gzip-compressed, tab-delimited attribute table straight from its URL
# and render it in the notebook.
matrix = pd.read_table(file_url, compression="gzip")
display(matrix)

Unnamed: 0,Gene,Adipose - Subcutaneous,Adipose - Visceral (Omentum),Adrenal Gland,Artery - Aorta,Artery - Coronary,Artery - Tibial,Bladder,Brain - Amygdala,Brain - Anterior cingulate cortex (BA24),...,Skin - Not Sun Exposed (Suprapubic),Skin - Sun Exposed (Lower leg),Small Intestine - Terminal Ileum,Spleen,Stomach,Testis,Thyroid,Uterus,Vagina,Whole Blood
0,A1BG,-0.156566,-0.150570,-0.174442,-0.160785,-0.105158,-0.122812,-0.176774,-0.115151,-0.131250,...,-0.168557,-0.170112,-0.151680,-0.108378,-0.180105,-0.178994,-0.145685,-0.110820,-0.134026,-0.171888
1,A1CF,-0.198620,-0.198620,-0.196606,-0.197949,-0.198620,-0.197949,-0.198620,-0.199963,-0.198620,...,-0.198620,-0.197949,0.863623,-0.198620,-0.159676,-0.192577,-0.197949,-0.199292,-0.198620,-0.198620
2,A2M,0.727863,0.770095,-0.318623,2.132138,1.624738,1.664368,0.758651,-0.580190,-0.597619,...,-0.572781,-0.486621,-0.092118,-0.115616,-0.323947,-0.605045,0.104905,0.358384,-0.284003,-0.738303
3,A2ML1,-0.207391,-0.207525,-0.206953,-0.207122,-0.206987,-0.207391,-0.204633,-0.187080,-0.184927,...,0.726517,0.676412,-0.207323,-0.207189,-0.207256,-0.119690,-0.207323,-0.205541,1.754785,-0.208063
4,A3GALT2,0.040568,0.332660,-0.251523,0.332660,0.332660,0.332660,0.040568,-0.835707,-0.835707,...,-0.543615,-0.543615,-0.251523,0.624752,-0.543615,5.298220,0.624752,-0.251523,-0.251523,1.793119
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18091,ZYG11A,-0.339492,-0.339492,-0.371598,-0.331466,-0.331466,-0.347519,-0.283307,-0.359559,-0.359559,...,-0.251201,-0.203041,-0.291333,-0.243174,-0.339492,5.664350,2.277155,-0.315413,0.025715,-0.371598
18092,ZYG11B,0.031657,-0.302870,0.024830,-0.307649,-0.190564,0.147718,0.100611,-0.295019,0.480538,...,-0.582780,-0.507000,-0.621695,-0.881124,-0.887268,-0.031152,0.108804,0.383594,-0.329837,-1.368578
18093,ZYX,0.386113,0.287837,-0.668390,1.660111,0.975860,1.472304,0.800317,-0.964593,-0.832099,...,-0.525520,-0.311515,-0.217955,0.268541,-0.588680,-0.283644,-0.498721,2.667834,0.096000,3.667411
18094,ZZEF1,0.357843,-0.280143,0.211628,0.212124,0.038354,0.538068,0.409974,-1.494800,-1.365713,...,0.462602,0.647295,3.508553,1.265919,0.044063,0.476007,0.749075,0.356354,-0.337984,-0.954869


In [8]:
# Every field is declared as an array with one entry per row of the matrix.
array_size = matrix.shape[0]

# Key field, read from the "Gene" column of the attribute table.
gene_field = mlc.Field(
    id="associations/gene",
    name="gene",
    description="The NCBI gene symbol",
    data_types=mlc.DataType.TEXT,
    is_array=True,
    array_shape=str(array_size),
    source=mlc.Source(
        file_object="dataset-attribute-table",
        extract=mlc.Extract(column="Gene"),
    ),
)
fields = [gene_field]

# One float field for each column after the first; the field id swaps spaces
# for underscores while the name keeps the column label unchanged.
for col in matrix.columns[1:]:
    column_source = mlc.Source(
        file_object="dataset-attribute-table",
        extract=mlc.Extract(column=col),
    )
    fields.append(
        mlc.Field(
            id=f"associations/{col.replace(' ','_')}",
            name=f"associations/{col}",
            data_types=mlc.DataType.FLOAT,
            is_array=True,
            array_shape=str(array_size),
            source=column_source,
        )
    )

# Report how many fields were declared in total.
print(len(fields))

55


In [9]:
# A single record set groups all declared fields and is keyed by the gene field.
associations_record_set = mlc.RecordSet(
    id="associations",
    name="associations",
    key="associations/gene",
    fields=fields,
)
record_sets = [associations_record_set]

### Assemble Dataset and Write JSON
Once we have defined each dataset layer, we can package the dataset metadata into a Croissant JSON-LD file.

In [10]:
# Gather the dataset-level attributes, resources and record sets into the
# keyword arguments of the top-level Croissant metadata object.
metadata_kwargs = dict(
    name=name,
    description=description,
    cite_as=cite_as,
    url=url,
    date_published=date_published,
    creators=creators,
    publisher=publishers,
    license=license,
    version=version,
    distribution=distribution,
    record_sets=record_sets,
)
metadata = mlc.Metadata(**metadata_kwargs)

# Print whatever warnings/suggestions were collected while building the metadata.
issues_report = metadata.issues.report()
print(issues_report)




In [11]:
# Serialize the metadata to indented JSON and store it in gtextissue.json.
with open("gtextissue.json", "w") as f:
    # default=str stringifies any value the json module cannot encode natively.
    content = json.dumps(metadata.to_json(), indent=4, default=str)
    # The trailing newline terminates the file.
    f.write(f"{content}\n")