# HuBMAP ASCT+B Annotations Croissant Builder 🥐

Author: Ido Diamant, Ma'ayan Lab, CFDE DRC

In [16]:
import datetime
import hashlib
import json
import mlcroissant as mlc
import os
import pandas as pd
import requests
from zipfile import ZipFile

## Construct Croissant from CFDE Attribute Table

In [17]:
# Dataset-level name and description written into the Croissant metadata.
# They must describe the HuBMAP ASCT+B annotations this notebook packages
# (text describing the CM4AI U2OS Cell Map does not belong here).
# NOTE(review): description wording is inferred from the matrix layout
# (genes x organ/cell-type columns) - confirm against the Harmonizome page.
name = 'HuBMAP ASCT+B Annotations'
description = "Gene-cell type biomarker associations from the HuBMAP Anatomical Structures, Cell Types, plus Biomarkers (ASCT+B) tables"

In [18]:
# Bibliographic details of the reference publication, assembled below into a
# BibTeX-style @article entry.
short_citation = 'Börner, bioRxiv, 2024'
title = 'Human BioMolecular Atlas Program (HuBMAP): 3D Human Reference Atlas Construction and Usage'
author = 'Börner, K'
journal = 'bioRxiv'
year = 2024
volume = 2024
pages = '03.27.587041'

# Doubled braces are literal braces in an f-string, so each value ends up
# wrapped as {value}.
# NOTE(review): short_citation is used as the entry key and contains commas
# and spaces, which strict BibTeX parsers may reject - confirm whether
# consumers parse this string.
cite_as=(f'@article{{{short_citation}, title={{{title}}}, author={{{author}}}, journal={{{journal}}}, year={{{year}}}, volume={{{volume}}}, pages={{{pages}}}}}')

In [19]:
# Organizations credited in the metadata.
creators = [
    mlc.Organization(
        url="https://hubmapconsortium.org/",
        name="The Human BioMolecular Atlas Program",
    ),
]
publishers = [
    mlc.Organization(
        url="https://maayanlab.cloud/",
        name="Ma'ayan Lab",
    ),
]

# Release details: landing page, publication date, version and license URL.
url = "https://maayanlab.cloud/Harmonizome/dataset/HuBMAP+ASCT$plus$B+Annotations"
date_published = datetime.date(year=2024, month=12, day=26)
version = "0.1.0"
license = "http://creativecommons.org/licenses/by-nc-sa/4.0"

### Resource

In [20]:
# Utility function to generate a SHA256 checksum for a FileObject from a URL
def get_sha256(url, timeout=60):
    """Return the hexadecimal SHA-256 digest of the content served at ``url``.

    The response body is streamed in 65536-byte chunks, so the whole file is
    never held in memory at once.

    Args:
        url: Address of the resource to download and hash.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Returns:
        The digest as a hexadecimal string.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.RequestException: On connection failures or timeouts.
    """
    sha256 = hashlib.sha256()
    # The context manager closes the streamed response even if hashing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Without this check the body of an error page would be hashed as if
        # it were the requested file.
        response.raise_for_status()
        for chunk in response.iter_content(chunk_size=65536):
            sha256.update(chunk)
    return sha256.hexdigest()

In [21]:
# Location of the gzip archive that holds the attribute matrix.
file_url = 'https://maayanlab.cloud/static/hdfs/harmonizome/data/asctb/gene_attribute_matrix.txt.gz'

# Two file objects: the downloadable archive, and the tab-separated table
# declared as contained in that archive.
distribution = [
    mlc.FileObject(
        id="dataset-attribute-table-archive",
        name="dataset-attribute-table-archive",
        description="Dataset attribute table archive from Harmonizome.",
        content_url=file_url,
        encoding_formats="application/gzip",
        # Downloads the archive once to compute its checksum.
        sha256=get_sha256(file_url)
    ),
    mlc.FileObject(
        id="dataset-attribute-table",
        name="dataset-attribute-table",
        description="Dataset attribute table from Harmonizome.",
        content_url="gene_attribute_matrix.txt",
        encoding_formats="text/tab-separated-values",
        # A list, not a set: it keeps a defined order and is JSON-serializable.
        contained_in=["dataset-attribute-table-archive"]
    )
]

### Structure/Semantics

In [22]:
# Read the gzip-compressed, tab-delimited attribute matrix straight from its
# URL, then show a preview of it in the notebook.
matrix = pd.read_csv(file_url, delimiter='\t', compression='gzip')
display(matrix)

Unnamed: 0,Gene,Blood vasculature_CT1_blood vessel endothelial cell,Blood vasculature_CT1_cardiac endothelial cell,Blood vasculature_CT1_fibroblast,Blood vasculature_CT1_vascular smooth muscle cell,Blood_CT2_CD14 monocyte,Blood_CT2_CD16 monocyte,Blood_CT2_CD4 T cell effector memory CD45RA,Blood_CT2_CD4 T cell memory,Blood_CT2_CD4 naive,...,Urinary bladder_CT2_detrusor smooth muscle cell of bladder,Urinary bladder_CT2_endothelial cell of bladder,Urinary bladder_CT2_intermediate cell of bladder urothelium,Urinary bladder_CT2_lipofibroblast cell of bladder,Urinary bladder_CT2_myofibroblast cell of bladder,Urinary bladder_CT2_pericyte cell of bladder,Urinary bladder_CT2_superficial cell of bladder urothelium,Urinary bladder_CT2_vascular smooth muscle cell of bladder,Venous blood vessel _CT1_blood vessel smooth muscle cell,Venous blood vessel _CT1_vein endothelial cell of respiratory system
0,A2M,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ABCA1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ABCA13,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ABCA3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ABCA4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1527,ZEB2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1528,ZFHX3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1529,ZIC1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1530,ZIC2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Describe the matrix columns as Croissant fields. Every field is declared as
# an array whose shape is the number of rows in the matrix.
array_size = matrix.shape[0]
fields = []

# Text field read from the "Gene" column.
gene_source = mlc.Source(
    file_object="dataset-attribute-table",
    extract=mlc.Extract(column="Gene"),
)
fields.append(
    mlc.Field(
        id="associations/gene",
        name="gene",
        description="The NCBI gene symbol",
        data_types=mlc.DataType.TEXT,
        is_array=True,
        array_shape=str(array_size),
        source=gene_source,
    )
)

# Every column after the first becomes an integer field. Spaces in the column
# name are replaced with underscores in the field id only; the name and the
# extracted column keep the original header text.
for col in matrix.columns[1:]:
    column_source = mlc.Source(
        file_object="dataset-attribute-table",
        extract=mlc.Extract(column=col),
    )
    fields.append(
        mlc.Field(
            id=f"associations/{col.replace(' ','_')}",
            name=f"associations/{col}",
            data_types=mlc.DataType.INTEGER,
            is_array=True,
            array_shape=str(array_size),
            source=column_source,
        )
    )

# Report how many fields were declared.
print(len(fields))

1322


In [24]:
# A single record set carries every declared field; the gene field id is
# given as its key.
record_sets = [
    mlc.RecordSet(
        name="associations",
        id="associations",
        fields=fields,
        key='associations/gene',
    ),
]

### Assemble Dataset and Write JSON
Once we have defined each dataset layer, we can package the dataset metadata into a Croissant JSON-LD file.

In [25]:
# Combine the descriptive values, provenance, file distribution and record
# sets defined in the earlier cells into one Croissant metadata object.
metadata = mlc.Metadata(
    # Descriptive information
    name=name,
    description=description,
    url=url,
    version=version,
    license=license,
    cite_as=cite_as,
    date_published=date_published,
    # Provenance
    creators=creators,
    publisher=publishers,
    # Resources and structure
    distribution=distribution,
    record_sets=record_sets,
)

# Print the report of issues collected on the metadata object.
print(metadata.issues.report())




In [26]:
# Serialize before opening the output file: opening with "w" truncates it, so
# a failure in to_json()/json.dumps() must not leave an empty file behind.
# default=str makes json.dumps fall back to str() for any value it cannot
# encode natively.
content = json.dumps(metadata.to_json(), indent=4, default=str)

# Explicit encoding so the result does not depend on the platform default.
with open("hubmapasctb.json", "w", encoding="utf-8") as f:
    f.write(content)
    f.write("\n")  # Terminate file with newline