# GTEx Tissue Sample Croissant Builder 🥐

Author: Ido Diamant, Ma'ayan Lab, CFDE DRC

In [1]:
import datetime
import hashlib
import json
import mlcroissant as mlc
import os
import pandas as pd
import requests
from zipfile import ZipFile

## Construct Croissant from CFDE Attribute Table

In [2]:
# Dataset title and one-line summary; both are passed to mlc.Metadata below.
name = "GTEx Tissue Sample Gene Expression Profiles"
description = (
    "Gene-tissue sample associations by differential expression "
    "of gene across tissue samples"
)

In [3]:
# Bibliographic details of the publication cited for this dataset.
short_citation = "GTEx Consortium, Science, 2015"
title = (
    "Human genomics. The Genotype-Tissue Expression (GTEx) pilot analysis: "
    "multitissue gene regulation in humans"
)
author = "GTEx Consortium"
journal = "Science"
year = 2015
volume = 348
pages = "648-660"

# Render the details as a single-line BibTeX-style @article entry, with
# short_citation in the citation-key position and every field as key={value}.
# NOTE(review): short_citation contains spaces and commas, which strict BibTeX
# parsers may not accept as a citation key -- confirm this is intended.
_bibtex_fields = {
    "title": title,
    "author": author,
    "journal": journal,
    "year": year,
    "volume": volume,
    "pages": pages,
}
_bibtex_body = ", ".join(
    f"{key}={{{value}}}" for key, value in _bibtex_fields.items()
)
cite_as = f"@article{{{short_citation}, {_bibtex_body}}}"

In [4]:
# Organization credited as the creator of the dataset.
creators = [
    mlc.Organization(
        name="The Genotype-Tissue Expression Project",
        url="https://gtexportal.org/home/",
    ),
]

# Organization credited as the publisher of the dataset.
publishers = [
    mlc.Organization(
        name="Ma'ayan Lab",
        url="https://maayanlab.cloud/",
    ),
]

# Remaining top-level metadata values, passed to mlc.Metadata below.
license = "https://gtexportal.org/home/license"
version = "0.1.0"
url = "https://maayanlab.cloud/Harmonizome/dataset/GTEx+Tissue+Sample+Gene+Expression+Profiles"
date_published = datetime.date(year=2015, month=4, day=6)

### Resource

In [5]:
# Utility function to generate a SHA256 checksum for a FileObject from a URL
def get_sha256(url, timeout=60):
    """Return the SHA-256 hex digest of the content served at ``url``.

    The body is streamed in 64 KiB chunks so the whole file never has to be
    held in memory at once.

    Args:
        url: Address of the file to download and hash.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot hang the notebook forever.

    Returns:
        The hexadecimal SHA-256 digest of the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Without this check the digest of an error page would be returned
            as if it were the digest of the file.
    """
    sha256 = hashlib.sha256()
    # The context manager releases the streamed connection even if hashing
    # is interrupted by an exception.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        for chunk in response.iter_content(chunk_size=65536):
            sha256.update(chunk)
    return sha256.hexdigest()

In [6]:
# Location of the zipped attribute matrix hosted by Harmonizome.
file_url = 'https://maayanlab.cloud/static/hdfs/harmonizome/data/gtexsample/gene_attribute_matrix_standardized.txt.zip'

# Two resources: the downloadable zip archive, and the tab-separated table
# that lives inside it.
distribution = [
    mlc.FileObject(
        id="dataset-attribute-table-archive",
        name="dataset-attribute-table-archive",
        description="Dataset attribute table archive from Harmonizome.",
        content_url=file_url,
        encoding_formats="application/zip",
        # Checksum is computed by downloading the archive at build time.
        sha256=get_sha256(file_url)
    ),
    mlc.FileObject(
        id="dataset-attribute-table",
        name="dataset-attribute-table",
        description="Dataset attribute table from Harmonizome.",
        # NOTE(review): the archive is named
        # gene_attribute_matrix_standardized.txt.zip, so the member file may be
        # gene_attribute_matrix_standardized.txt -- confirm the name inside
        # the zip.
        content_url="gene_attribute_matrix.txt",
        encoding_formats="text/tab-separated-values",
        # A list (not a set literal): ordered and JSON-serializable, so the
        # reference to the parent archive is emitted as an id rather than
        # risking being stringified by a json ``default=str`` fallback.
        contained_in=["dataset-attribute-table-archive"]
    )
]

### Structure/Semantics

In [7]:
# Load the tab-separated attribute matrix straight from the zipped download
# (read_table uses a tab separator by default) and show it in the notebook.
matrix = pd.read_table(file_url, compression='zip')
display(matrix)

Unnamed: 0,GeneSym,GTEX-N7MS-0007-SM-2D7W1,GTEX-N7MS-0008-SM-4E3JI,GTEX-N7MS-0011-R10A-SM-2HMJK,GTEX-N7MS-0011-R11A-SM-2HMJS,GTEX-N7MS-0011-R1a-SM-2HMJG,GTEX-N7MS-0011-R2a-SM-2HML6,GTEX-N7MS-0011-R3a-SM-33HC6,GTEX-N7MS-0011-R4a-SM-2HMKW,GTEX-N7MS-0011-R5a-SM-2HMK8,...,GTEX-XYKS-1626-SM-4BRUQ,GTEX-XYKS-1726-SM-4E3IO,GTEX-XYKS-1826-SM-4E3JV,GTEX-XYKS-2126-SM-4E3IB,GTEX-XYKS-2226-SM-4E3IU,GTEX-XYKS-2426-SM-4AT43,GTEX-XYKS-2526-SM-4BOPX,GTEX-XYKS-2626-SM-4BRUT,GTEX-XYKS-2726-SM-4E3IC,GTEX-Y8E5-0006-SM-47JWQ
0,A1BG,-0.964151,0.660995,0.019721,0.720628,0.176152,0.224579,-0.039834,0.278527,0.388973,...,0.601416,0.038863,0.014535,-0.376712,-0.083809,-0.819895,0.755862,0.216851,0.097617,-0.811291
1,A1BG-AS1,-0.495306,0.572909,0.556190,1.412914,0.321324,0.042705,0.987109,0.079516,0.675136,...,0.446421,-0.243193,0.044189,-0.154137,-0.184313,-0.964118,0.392653,0.197975,-0.119116,-0.776338
2,A2M,-1.038290,-0.733332,-0.200511,-0.509111,-0.208774,-0.034390,-0.254067,-0.173735,-0.069482,...,0.202574,-0.121651,0.410014,-0.135501,-0.028970,-0.106922,0.119028,0.802910,0.085012,-1.945557
3,A2M-AS1,-0.783816,-0.947743,-0.245183,-0.112705,-0.229406,-0.044836,-0.370200,-0.265293,-0.213973,...,0.226658,-0.010592,0.090325,-0.310080,0.020250,-0.253030,0.175351,0.435032,0.023089,-1.238140
4,A2ML1,-0.782910,-0.992507,0.272209,-0.003609,0.274633,0.276170,0.116366,0.267865,0.443996,...,0.117370,0.013598,-0.278794,1.054349,-0.566623,-0.302294,0.474592,0.028869,-0.096926,-0.356445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19245,ZYG11A,-0.911723,0.658286,-0.420445,0.516886,-0.911723,-0.272469,-0.394870,-0.911723,-0.027062,...,0.011933,0.225891,0.166556,-0.199962,0.023229,-0.911723,-0.096583,-0.263910,-0.400880,-0.911723
19246,ZYG11B,-1.316360,0.012772,0.857888,0.874892,0.516327,0.594693,1.109993,0.094937,0.247287,...,0.191419,0.128198,0.042065,-0.414910,0.058127,1.158750,0.039850,0.268110,-0.073976,-1.748028
19247,ZYX,0.950811,0.303397,-0.288773,-0.063519,-0.581361,-0.723308,-0.475543,-1.538263,-1.059103,...,0.798438,0.218276,0.097428,0.013339,0.499380,-0.255022,0.053970,0.406006,0.081746,0.231277
19248,ZZEF1,-0.262803,-0.498579,-0.899706,0.152490,-0.698355,-0.438409,-0.762258,-0.633121,-0.418792,...,-0.106382,-0.026728,-0.174378,-0.085285,1.223314,0.316755,-0.083924,0.051327,-0.501601,-2.225576


In [8]:
# Every field is declared as an array with one entry per row of the matrix.
array_size = matrix.shape[0]
column_shape = str(array_size)

# Text field read from the "GeneSym" column; the record set below uses its id
# as the key.
gene_field = mlc.Field(
    id="associations/gene",
    name="gene",
    description="The NCBI gene symbol",
    data_types=mlc.DataType.TEXT,
    is_array=True,
    array_shape=column_shape,
    source=mlc.Source(
        file_object="dataset-attribute-table",
        extract=mlc.Extract(column="GeneSym"),
    ),
)

# One float field for every column after the first; spaces in the column name
# are replaced by underscores in the id only, not in the name.
sample_fields = [
    mlc.Field(
        id=f"associations/{column.replace(' ','_')}",
        name=f"associations/{column}",
        data_types=mlc.DataType.FLOAT,
        is_array=True,
        array_shape=column_shape,
        source=mlc.Source(
            file_object="dataset-attribute-table",
            extract=mlc.Extract(column=column),
        ),
    )
    for column in matrix.columns[1:]
]

fields = [gene_field, *sample_fields]

print(len(fields))

2919


In [9]:
# Single record set holding every field built above, keyed by the gene field.
associations_record_set = mlc.RecordSet(
    id="associations",
    name="associations",
    key='associations/gene',
    fields=fields,
)

record_sets = [associations_record_set]

### Assemble Dataset and Write JSON
Once we have defined each dataset layer, we can package the dataset metadata into a Croissant JSON-LD file.

In [10]:
# Gather the values defined in the earlier cells into one Croissant Metadata
# object.
metadata = mlc.Metadata(
    # Descriptive information
    name=name,
    description=description,
    url=url,
    version=version,
    license=license,
    # Attribution
    cite_as=cite_as,
    creators=creators,
    publisher=publishers,
    date_published=date_published,
    # Resources and structure
    distribution=distribution,
    record_sets=record_sets,
)

# Display any warnings/suggestions encountered by the validator when building the metadata
issue_report = metadata.issues.report()
print(issue_report)




In [11]:
# Serialize the metadata to indented JSON and write it to disk.
with open("gtextissuesample.json", "w") as out_file:
    # default=str stringifies any value json cannot encode natively.
    serialized = json.dumps(metadata.to_json(), indent=4, default=str)
    # Terminate file with newline
    out_file.write(serialized)
    out_file.write("\n")