# CM4AI U2OS Croissant Builder 🥐

Author: Ido Diamant, Ma'ayan Lab, CFDE DRC

In [2]:
import datetime
import hashlib
import json
import mlcroissant as mlc
import os
import pandas as pd
import requests
from zipfile import ZipFile

## Construct Croissant from CFDE Attribute Table

In [6]:
# Human-readable dataset name and description written into the Croissant metadata.
name = 'GTEx Tissue-Specific Aging Signatures'
description = "Tissue-specific aging signatures created from GTEx RNA-seq gene expression profiles"

In [7]:
# Bibliographic details of the publication cited for this dataset.
short_citation = 'GTEx Consortium, Science, 2020'
title = 'The GTEx Consortium atlas of genetic regulatory effects across human tissues'
author = 'GTEx Consortium'
journal = 'Science'
year = 2020
volume = 369
pages = '1318-1330'

# A BibTeX citation key may not contain commas or whitespace (the first comma
# ends the key), so derive the key from the alphanumeric characters of the
# short citation instead of using the short citation verbatim.
cite_key = ''.join(ch for ch in short_citation if ch.isalnum())

# Doubled braces are literal braces in an f-string; the third brace opens the
# interpolated value.
cite_as = (
    f'@article{{{cite_key}, title={{{title}}}, author={{{author}}}, '
    f'journal={{{journal}}}, year={{{year}}}, volume={{{volume}}}, pages={{{pages}}}}}'
)

In [8]:
# Organizations credited as creator and publisher of the dataset.
creators = [mlc.Organization(name="GTEx", url="https://gtexportal.org/home/")]
publishers = [mlc.Organization(name="Ma'ayan Lab", url="https://maayanlab.cloud/")]

# Release-level metadata: landing page, licence, version and publication date.
url = "https://maayanlab.cloud/Harmonizome/dataset/GTEx+Tissue-Specific+Aging+Signatures"
# NOTE: `license` shadows the interactive builtin of the same name; the name is
# kept because the metadata-assembly cell reads it.
license = "https://gtexportal.org/home/license"
version = "0.1.0"
date_published = datetime.date(year=2024, month=12, day=6)

### Resource

In [9]:
def get_sha256(url, chunk_size=65536, timeout=60):
    """Compute the SHA-256 checksum of the content served at a URL.

    The response body is streamed and hashed chunk by chunk, so the whole
    file is never held in memory at once.

    Args:
        url: Address of the file to download and hash.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Seconds to wait for the server before giving up, passed
            straight to ``requests.get``.

    Returns:
        The hexadecimal SHA-256 digest of the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    sha256 = hashlib.sha256()
    # The context manager releases the connection even if hashing is interrupted.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Fail loudly on HTTP errors; otherwise the checksum of an error page
        # would be recorded as the checksum of the dataset.
        response.raise_for_status()
        for chunk in response.iter_content(chunk_size=chunk_size):
            sha256.update(chunk)
    return sha256.hexdigest()

In [20]:
# Location of the compressed gene-attribute matrix that this Croissant describes.
file_url = 'https://maayanlab.cloud/static/hdfs/harmonizome/data/gtexagingsignatures/gene_attribute_matrix_standardized.txt.gz'

# Two resources are declared: the downloadable archive, and the table inside it.
distribution = [
    mlc.FileObject(
        id="dataset-attribute-table-archive",
        name="dataset-attribute-table-archive",
        description="Dataset attribute table archive from Harmonizome.",
        content_url=file_url,
        # NOTE(review): the URL ends in .txt.gz, which suggests a gzip stream
        # rather than a zip archive -- confirm whether "application/gzip" and
        # a matching inner file name should be used instead.
        encoding_formats="application/zip",
        # Downloads the file once to compute its checksum.
        sha256=get_sha256(file_url)
    ),
    mlc.FileObject(
        id="dataset-attribute-table",
        name="dataset-attribute-table",
        description="Dataset attribute table from Harmonizome.",
        content_url="gene_attribute_matrix.txt",
        encoding_formats="text/tab-separated-values",
        # A list of parent FileObject ids. The original used a set literal,
        # which is not JSON-serializable and is not the documented type.
        contained_in=["dataset-attribute-table-archive"]
    )
]

### Structure/Semantics

In [21]:
# Read the gzip-compressed, tab-separated attribute matrix straight from its
# URL, then show a preview of the resulting frame.
matrix = pd.read_csv(
    file_url,
    compression='gzip',
    sep='\t',
)
display(matrix)

Unnamed: 0,gene_symbol,GTEx AdiposeTissue 20-29 vs 30-39,GTEx AdiposeTissue 20-29 vs 40-49,GTEx AdiposeTissue 20-29 vs 50-59,GTEx AdiposeTissue 20-29 vs 60-69,GTEx AdiposeTissue 20-29 vs 70-79,GTEx AdrenalGland 20-29 vs 30-39,GTEx AdrenalGland 20-29 vs 40-49,GTEx AdrenalGland 20-29 vs 50-59,GTEx AdrenalGland 20-29 vs 60-69,...,GTEx Thyroid 20-29 vs 70-79,GTEx Uterus 20-29 vs 30-39,GTEx Uterus 20-29 vs 40-49,GTEx Uterus 20-29 vs 50-59,GTEx Uterus 20-29 vs 60-69,GTEx Vagina 20-29 vs 30-39,GTEx Vagina 20-29 vs 40-49,GTEx Vagina 20-29 vs 50-59,GTEx Vagina 20-29 vs 60-69,GTEx Vagina 20-29 vs 70-79
0,A1BG,3.366056,2.621722,6.480054,9.498323,6.373474,-0.054380,0.000518,-0.025554,0.041882,...,3.831461,0.00001,-0.003610,0.024576,1.080219,-0.000019,0.319386,0.000351,0.252060,0.589632
1,A1BG-AS1,1.336302,0.901327,0.996287,2.861223,1.974662,-0.027725,0.000518,0.000533,-0.108658,...,1.507318,0.00001,0.167571,-1.602299,-5.353952,-0.000019,0.336304,0.000351,0.232804,0.158252
2,A2M,1.012793,0.406450,1.277877,2.269890,-0.135403,0.110478,-0.000518,0.055992,0.069557,...,-0.354329,0.00001,0.199208,-0.175037,-0.933719,0.000019,0.187902,0.000351,1.123160,0.244233
3,A2M-AS1,1.760969,2.684223,5.040628,8.304753,2.202745,-0.027725,-0.000518,-0.001355,-0.132124,...,0.789560,0.00001,0.029283,1.576347,2.839745,0.000019,0.342840,0.000351,3.332446,1.208515
4,A4GALT,0.712483,1.432749,4.540573,2.904208,0.814431,-0.008907,0.000483,0.083427,0.246593,...,0.805844,-0.00001,0.444423,-0.400248,-0.002270,-0.000019,-0.127840,0.000351,-0.099056,0.252528
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13514,ZXDC,0.136033,-0.802989,-0.212719,-1.074036,-1.233650,-0.114086,-0.219694,-0.433013,-0.275667,...,-0.372410,-0.00001,-0.546186,-0.043856,0.994810,0.000019,-0.337626,-0.000351,-1.236718,-0.167540
13515,ZYG11B,-0.090090,0.868708,0.624495,0.700344,-0.092907,0.036965,-0.000518,-0.008171,-1.210179,...,0.006753,-0.00001,0.028182,-0.381969,-1.880197,0.000019,0.176102,-0.000351,1.915612,1.276734
13516,ZYX,-0.525900,-1.256519,-0.595089,-0.579116,-0.374565,-0.009256,0.000518,0.093367,0.233675,...,0.579463,-0.00001,0.254694,-3.595524,-11.776127,0.000019,-0.106554,-0.000351,-0.886813,0.465678
13517,ZZEF1,0.190992,-0.792088,-1.347324,-0.621699,-1.288655,0.017267,0.000518,0.005172,0.106631,...,-0.355558,-0.00001,-0.028072,0.022375,-0.622891,0.000019,-0.025446,-0.000351,0.601777,-0.630121


In [22]:
# Every field is declared as an array with one entry per row of the matrix.
array_size = matrix.shape[0]
shape = str(array_size)

# Key field: the gene symbol column.
gene_field = mlc.Field(
    id="associations/gene",
    name="gene",
    description="The NCBI gene symbol",
    data_types=mlc.DataType.TEXT,
    is_array=True,
    array_shape=shape,
    source=mlc.Source(
        file_object="dataset-attribute-table",
        extract=mlc.Extract(column="gene_symbol"),
    ),
)

# One float field per remaining column; the first column is skipped because
# it is the one already covered by the gene field above.
value_fields = [
    mlc.Field(
        id=f"associations/{column_name.replace(' ','_')}",
        name=f"associations/{column_name}",
        data_types=mlc.DataType.FLOAT,
        is_array=True,
        array_shape=shape,
        source=mlc.Source(
            file_object="dataset-attribute-table",
            extract=mlc.Extract(column=column_name),
        ),
    )
    for column_name in matrix.columns[1:]
]

fields = [gene_field, *value_fields]

print(len(fields))

136


In [23]:
# A single record set groups all declared fields; the gene field is its key.
associations = mlc.RecordSet(
    id="associations",
    name="associations",
    key="associations/gene",
    fields=fields,
)
record_sets = [associations]

### Assemble Dataset and Write JSON
Once we have defined each dataset layer, we can package the dataset metadata into a Croissant JSON-LD file.

In [24]:
# Collect every dataset-level attribute defined in the cells above, then build
# the Croissant metadata object from them in one call.
metadata_kwargs = dict(
    name=name,
    description=description,
    cite_as=cite_as,
    url=url,
    date_published=date_published,
    creators=creators,
    publisher=publishers,
    license=license,
    version=version,
    distribution=distribution,
    record_sets=record_sets,
)
metadata = mlc.Metadata(**metadata_kwargs)

# Show whatever warnings/suggestions the validator collected while building.
issues_report = metadata.issues.report()
print(issues_report)




In [None]:
# Serialize before opening the output file: opening with "w" truncates it, so
# a failure inside to_json()/json.dumps() would otherwise leave an empty or
# partial file behind.
# default=str falls back to str() for any value json cannot encode natively.
croissant_json = json.dumps(metadata.to_json(), indent=4, default=str)

with open("gtexagingsignatures.json", "w", encoding="utf-8") as f:
    f.write(croissant_json)
    f.write("\n")  # Terminate file with newline