# LINCS L1000 CMAP Chemical Perturbation Consensus Signatures Croissant Builder 🥐

Author: Ido Diamant, Ma'ayan Lab, CFDE DRC

In [1]:
import datetime
import hashlib
import json
import mlcroissant as mlc
import os
import pandas as pd
import requests
from zipfile import ZipFile

## Construct Croissant from CFDE Attribute Table

In [2]:
# Human-readable dataset title and one-line summary for the Croissant metadata.
name = "LINCS L1000 CMAP Chemical Perturbation Consensus Signatures"
description = (
    "Gene association consensus signatures following chemical perturbation"
)

In [3]:
# Bibliographic details of the publication that describes the dataset.
short_citation = 'Evangelista, Nucleic Acids Res, 2022'
title = 'SigCom LINCS: data and metadata search engine for a million gene expression signatures'
author = 'Evangelista, JE'
journal = 'Nucleic Acids Res'
year = 2022
volume = 50
pages = 'W697-W709'

# A BibTeX entry key is terminated by the first comma and may not contain
# whitespace, so the human-readable short citation cannot be used verbatim.
# Collapse commas/whitespace runs into single underscores to get a valid key.
citation_key = '_'.join(short_citation.replace(',', ' ').split())

# BibTeX record stored in the Croissant `citeAs` property. Doubled braces are
# literal braces in the f-string; the innermost pair interpolates the value.
cite_as = (
    f'@article{{{citation_key}, '
    f'title={{{title}}}, '
    f'author={{{author}}}, '
    f'journal={{{journal}}}, '
    f'year={{{year}}}, '
    f'volume={{{volume}}}, '
    f'pages={{{pages}}}}}'
)

In [4]:
# Organization that produced the underlying data.
# (The name previously carried a stray trailing space, which would have been
# written verbatim into the generated metadata.)
creators = [
    mlc.Organization(
        name="The Library of Integrated Network-Based Cellular Signatures",
        url="https://lincsproject.org/",
    )
]

# Organization that publishes the processed dataset.
publishers = [
    mlc.Organization(name="Ma'ayan Lab", url="https://maayanlab.cloud/")
]

# Release policy URL, metadata version, dataset landing page and release date.
# NOTE: `license` shadows the interactive `license` builtin; the name is kept
# because the metadata assembly cell refers to it.
license = "https://lincsproject.org/LINCS/data/release-policy"
version = "0.1.0"
url = "https://maayanlab.cloud/Harmonizome/dataset/LINCS+L1000+CMAP+Chemical+Perturbation+Consensus+Signatures"
date_published = datetime.date(2023, 10, 27)

### Resource

In [5]:
# Utility function to generate a SHA256 checksum for a FileObject from a URL
def get_sha256(url, timeout=60):
    """Return the hex-encoded SHA-256 digest of the content served at *url*.

    The response body is streamed in 64 KiB chunks, so the whole file is never
    held in memory at once.

    Args:
        url: Location of the file to hash.
        timeout: Seconds to wait for the server to connect/send data before
            giving up (passed straight to ``requests.get``).

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    sha256 = hashlib.sha256()
    # The context manager releases the connection even if hashing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Without this check the body of an error page would be hashed and a
        # wrong checksum silently recorded in the metadata.
        response.raise_for_status()
        for chunk in response.iter_content(chunk_size=65536):
            sha256.update(chunk)
    return sha256.hexdigest()

In [6]:
# Remote location of the gzipped gene-attribute matrix.
file_url = 'https://maayanlab.cloud/static/hdfs/harmonizome/data/l1000chempert/gene_attribute_matrix_standardized.txt.gz'

# Resource layer: the downloadable archive plus the table stored inside it.
distribution = [
    mlc.FileObject(
        id="dataset-attribute-table-archive",
        name="dataset-attribute-table-archive",
        description="Dataset attribute table archive from Harmonizome.",
        content_url=file_url,
        encoding_formats="application/gzip",
        # Checksum is computed by downloading the file (network access needed).
        sha256=get_sha256(file_url),
    ),
    mlc.FileObject(
        id="dataset-attribute-table",
        name="dataset-attribute-table",
        description="Dataset attribute table from Harmonizome.",
        # NOTE(review): the archive is named
        # "gene_attribute_matrix_standardized.txt.gz" while this points at
        # "gene_attribute_matrix.txt" -- confirm the extracted file name.
        content_url="gene_attribute_matrix.txt",
        encoding_formats="text/tab-separated-values",
        # `contained_in` takes a list of FileObject ids. A set is not JSON
        # serializable and would be stringified by a `default=str` fallback
        # when the metadata is dumped.
        contained_in=["dataset-attribute-table-archive"],
    ),
]

### Structure/Semantics

In [7]:
# Read the gzipped, tab-separated attribute table straight from its URL into a
# DataFrame, then render it in the notebook for a visual sanity check.
matrix = pd.read_csv(
    file_url,
    compression='gzip',
    sep='\t',
)
display(matrix)

Unnamed: 0,Gene,afatinib,erlotinib,neratinib,lapatinib,pazopanib,HMN-214,fulvestrant,vorinostat,mitoxantrone,...,ICI-174864,antalarmin,CI-994,DAC-3,943,949,SRT-3657,compe,C646,saha
0,A1CF,0.328901,0.208528,0.149943,0.029026,0.001270,0.456000,0.002512,0.223298,0.122517,...,0.010235,0.017238,0.419356,-0.094539,0.073399,-0.020302,-0.051255,0.392735,-0.883001,0.744619
1,A2M,-0.088758,0.064381,-0.238405,0.107817,0.157477,-0.024835,0.148473,-0.173339,-0.130047,...,-0.313662,0.152480,0.910623,0.431119,-0.204038,-0.406836,-0.023175,0.226324,-0.157330,0.531475
2,A4GALT,-0.140883,-0.239096,-0.069071,0.023087,-0.141796,-0.089207,-0.130126,0.091296,0.242389,...,0.812422,0.682623,0.300974,-0.635747,0.147656,0.330006,-0.439392,0.022643,-1.090465,1.651163
3,A4GNT,0.188614,-0.006162,0.026468,-0.175782,-0.139949,-0.166020,-0.207038,-0.490050,-0.095327,...,0.947948,0.384218,-0.345334,0.590778,-0.055246,-0.325932,-0.948995,-0.308855,0.559912,-1.229039
4,AAAS,-0.076077,-0.025207,-0.304972,0.062615,-0.042386,0.025247,-0.089120,0.170582,0.257598,...,1.262607,0.631737,-1.486842,-0.246083,-0.243602,0.399528,0.366060,0.049652,-0.555752,0.676050
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12122,ZXDB,-0.170131,-0.074527,-0.005495,-0.009645,-0.025095,0.349328,-0.025503,-0.409873,-0.019554,...,-1.858385,-0.914796,-1.587304,0.758755,0.038687,-0.170786,-0.119987,0.359791,0.368366,0.811091
12123,ZXDC,-0.021357,0.295761,0.055963,0.083330,-0.026343,0.180142,0.161431,-0.440406,0.131583,...,-0.178705,0.196659,-0.228942,-0.123306,-0.369615,0.608931,-0.195139,0.425004,-0.061930,-0.393016
12124,ZYX,-0.177777,-0.151027,-0.150141,-0.104840,-0.078424,0.248666,-0.076810,-0.368883,0.004185,...,0.123693,0.053829,-0.091249,-0.073308,-0.229728,-0.391841,0.534675,0.002690,0.699235,-0.491933
12125,ZZEF1,-0.037664,0.119073,-0.107402,-0.053899,-0.227155,0.358892,0.072211,0.045461,-0.132558,...,0.194284,-0.122994,-1.509408,0.171970,0.482573,-0.513789,0.158866,0.434732,0.016579,0.724406


In [8]:
# Structure layer: one Croissant field per column of the attribute table.
# Every field is declared as an array whose length is the table's row count.
array_size = matrix.shape[0]

# Start with the gene-symbol column, which serves as the record key.
fields = [
    mlc.Field(
        id="associations/gene",
        name="gene",
        description="The NCBI gene symbol",
        data_types=mlc.DataType.TEXT,
        is_array=True,
        array_shape=str(array_size),
        source=mlc.Source(
            file_object="dataset-attribute-table",
            extract=mlc.Extract(column="Gene"),
        ),
    )
]

# Every remaining column becomes a float field. Spaces in the column header
# are replaced with underscores for the id, while the name keeps the header
# verbatim.
# NOTE(review): these names carry an "associations/" prefix whereas the gene
# field's name does not -- confirm this difference is intended.
for col in matrix.columns[1:]:
    fields += [
        mlc.Field(
            id=f'associations/{col.replace(" ", "_")}',
            name=f'associations/{col}',
            data_types=mlc.DataType.FLOAT,
            is_array=True,
            array_shape=str(array_size),
            source=mlc.Source(
                file_object="dataset-attribute-table",
                extract=mlc.Extract(column=col),
            ),
        )
    ]

# Report how many fields were declared (gene column included).
print(len(fields))

33610


In [9]:
# A single record set groups all fields; records are keyed by the gene field.
record_sets = [
    mlc.RecordSet(
        id="associations",
        name="associations",
        key="associations/gene",
        fields=fields,
    ),
]

### Assemble Dataset and Write JSON
Once we have defined each dataset layer, we can package the dataset metadata into a Croissant JSON-LD file.

In [10]:
# Combine the descriptive, resource and structure layers into one Croissant
# metadata object.
metadata = mlc.Metadata(
    name=name,
    description=description,
    url=url,
    version=version,
    license=license,
    cite_as=cite_as,
    date_published=date_published,
    creators=creators,
    publisher=publishers,
    distribution=distribution,
    record_sets=record_sets,
)

# Show whatever warnings/suggestions the validator collected while building
# the metadata.
print(metadata.issues.report())




In [11]:
# Serialize the metadata to a JSON-LD file (4-space indent). Values the json
# module cannot encode natively are converted with `str`.
with open("lincsl1000chempertconsensus.json", "w") as f:
    content = json.dumps(metadata.to_json(), indent=4, default=str)
    # The trailing newline terminates the file.
    f.write(content + "\n")