# LINCS L1000 CMAP CRISPR Knockout Consensus Signatures Croissant Builder 🥐

Author: Ido Diamant, Ma'ayan Lab, CFDE DRC

In [1]:
import datetime
import hashlib
import json
import mlcroissant as mlc
import os
import pandas as pd
import requests
from zipfile import ZipFile

## Construct Croissant from CFDE Attribute Table

In [2]:
# Human-readable dataset title and one-line summary used in the Croissant metadata.
name = 'LINCS L1000 CMAP CRISPR Knockout Consensus Signatures'
description = (
    "Gene association consensus signatures following CRISPR gene knockout"
)

In [3]:
# Bibliographic details of the publication describing the dataset.
short_citation = 'Evangelista, Nucleic Acids Res, 2022'
title = 'SigCom LINCS: data and metadata search engine for a million gene expression signatures'
author = 'Evangelista, JE'
journal = 'Nucleic Acids Res'
year = 2022
volume = 50
pages = 'W697-W709'

# A BibTeX entry key is terminated by the first comma, so the human-readable
# short citation (which contains commas and spaces) cannot be used verbatim.
# Derive a key from it by keeping only its alphanumeric characters.
citation_key = ''.join(ch for ch in short_citation if ch.isalnum())

# BibTeX record; doubled braces are literal braces inside the f-string.
cite_as = (
    f'@article{{{citation_key}, title={{{title}}}, author={{{author}}}, '
    f'journal={{{journal}}}, year={{{year}}}, volume={{{volume}}}, '
    f'pages={{{pages}}}}}'
)

In [4]:
# Terms of use, builder version, landing page, and publication date.
license = "https://lincsproject.org/LINCS/data/release-policy"
version = "0.1.0"
url = "https://maayanlab.cloud/Harmonizome/dataset/LINCS+L1000+CMAP+CRISPR+Knockout+Consensus+Signatures"
date_published = datetime.date(year=2023, month=9, day=5)

# Organization that produced the underlying data.
# NOTE(review): the name literal ends with a trailing space; kept as-is here,
# confirm whether it is intentional.
lincs_program = mlc.Organization(
    name="The Library of Integrated Network-Based Cellular Signatures ",
    url="https://lincsproject.org/",
)
creators = [lincs_program]

# Organization that distributes the processed attribute table.
maayan_lab = mlc.Organization(
    name="Ma'ayan Lab",
    url="https://maayanlab.cloud/",
)
publishers = [maayan_lab]

### Resource

In [5]:
# Utility function to generate a SHA256 checksum for a FileObject from a URL
def get_sha256(url, timeout=60):
    """Return the hex SHA-256 digest of the content served at *url*.

    The body is streamed in 64 KiB chunks so the whole file is never held in
    memory at once.

    Args:
        url: Address of the resource to download and hash.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The digest as a lowercase hexadecimal string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    sha256 = hashlib.sha256()
    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Without this check an HTTP error page would be hashed silently and
        # recorded as the checksum of the dataset file.
        response.raise_for_status()
        for chunk in response.iter_content(chunk_size=65536):
            sha256.update(chunk)
    return sha256.hexdigest()

In [6]:
# Remote location of the gzip-compressed gene-attribute matrix.
file_url = 'https://maayanlab.cloud/static/hdfs/harmonizome/data/l1000crispr/gene_attribute_matrix_standardized.txt.gz'

# Resource layer: the downloadable archive plus the table it contains.
distribution = [
    mlc.FileObject(
        id="dataset-attribute-table-archive",
        name="dataset-attribute-table-archive",
        description="Dataset attribute table archive from Harmonizome.",
        content_url=file_url,
        encoding_formats="application/gzip",
        # Checksum is computed by downloading the file at build time.
        sha256=get_sha256(file_url),
    ),
    mlc.FileObject(
        id="dataset-attribute-table",
        name="dataset-attribute-table",
        description="Dataset attribute table from Harmonizome.",
        # NOTE(review): the archive is named
        # "gene_attribute_matrix_standardized.txt.gz" while this path omits
        # "_standardized" - confirm the name of the contained file.
        content_url="gene_attribute_matrix.txt",
        encoding_formats="text/tab-separated-values",
        # A list (not a set) of parent ids: a set is not JSON-serializable and
        # may be written as its str() form by json.dumps(..., default=str).
        contained_in=["dataset-attribute-table-archive"],
    ),
]

### Structure/Semantics

In [7]:
# Read the gzip-compressed, tab-separated attribute table straight from its
# URL and render it in the notebook.
matrix = pd.read_csv(
    file_url,
    sep='\t',
    compression='gzip',
)
display(matrix)

Unnamed: 0,Gene,A1BG,A1CF,A2M,AADACL2,AADAT,AAK1,AAMP,AASS,AATK,...,ZNF646,ZNF668,ZNF683,ZNF80,ZNF831,ZNRD2,ZNRF3,ZPBP,ZPBP2,ZWINT
0,A1CF,-0.002343,0.000133,-0.000446,0.004149,-0.000664,0.001079,-0.003183,0.000951,-0.002316,...,1.150108e-03,0.001450,0.001017,-0.000544,-0.000214,-0.002032,0.002819,-0.001489,-0.001902,-0.000855
1,A2M,-0.008798,-0.002118,0.000301,0.008625,-0.002525,0.001618,-0.003352,-0.001910,-0.003653,...,-4.726684e-03,0.004265,0.003752,0.002595,-0.005815,-0.006310,0.002501,-0.001742,0.006032,-0.001882
2,A4GALT,0.000856,-0.001183,-0.000939,-0.000811,0.000631,-0.000292,-0.000715,-0.000138,-0.000005,...,-9.950775e-07,-0.001491,-0.000487,-0.000368,-0.000825,-0.001824,0.001074,-0.001376,0.002395,-0.000878
3,A4GNT,0.000976,0.001011,0.000855,0.000791,-0.000187,0.001431,-0.000339,0.000157,-0.000327,...,-4.863465e-04,0.000304,0.001015,-0.000833,-0.000256,0.000585,-0.000297,0.000795,0.000428,0.000051
4,AAAS,0.001380,0.000129,0.001421,0.001803,0.001039,0.001388,-0.002070,-0.001372,-0.001970,...,3.991222e-03,0.000740,0.002452,-0.001150,-0.000420,0.000214,-0.001089,-0.000543,0.001598,-0.000055
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12121,ZXDB,0.000540,0.000451,0.000691,0.000302,0.000914,-0.000407,0.000916,0.000931,0.000605,...,-1.557152e-03,-0.000176,-0.000156,0.001083,-0.000446,0.000681,0.002135,0.000843,-0.000717,-0.000504
12122,ZXDC,0.003487,-0.000807,-0.001937,0.000681,-0.000613,-0.000736,0.000523,-0.000929,-0.001271,...,2.354290e-03,0.000945,-0.002075,0.000515,-0.000899,0.000743,-0.000647,-0.000419,-0.002936,0.001403
12123,ZYX,-0.000065,0.001943,0.000617,0.005519,0.002654,-0.001830,-0.002095,0.001002,-0.001571,...,-1.475416e-03,-0.001339,0.001660,0.003981,0.001593,0.003006,-0.003611,0.002687,-0.000219,-0.000048
12124,ZZEF1,-0.000605,-0.001808,0.001406,-0.001591,-0.001241,0.000452,0.003939,-0.000911,0.002606,...,2.003018e-03,-0.001284,-0.002375,-0.003561,-0.004013,0.001402,-0.003826,-0.005081,0.001325,0.001966


In [8]:
# Each field is declared as an array whose length is the table's row count.
array_size = matrix.shape[0]

# Field for the "Gene" column of the attribute table.
gene_field = mlc.Field(
    id="associations/gene",
    name="gene",
    description="The NCBI gene symbol",
    data_types=mlc.DataType.TEXT,
    is_array=True,
    array_shape=str(array_size),
    source=mlc.Source(
        file_object="dataset-attribute-table",
        extract=mlc.Extract(column="Gene"),
    ),
)

# One float field per remaining column (every column after the first).
# Spaces in a column label are replaced by underscores in the field id only.
attribute_fields = [
    mlc.Field(
        id=f"associations/{col.replace(' ','_')}",
        name=f"associations/{col}",
        data_types=mlc.DataType.FLOAT,
        is_array=True,
        array_shape=str(array_size),
        source=mlc.Source(
            file_object="dataset-attribute-table",
            extract=mlc.Extract(column=col),
        ),
    )
    for col in matrix.columns[1:]
]

fields = [gene_field, *attribute_fields]

print(len(fields))

5105


In [9]:
# Single record set grouping every field, keyed on the gene field.
associations_record_set = mlc.RecordSet(
    id="associations",
    name="associations",
    key='associations/gene',
    fields=fields,
)
record_sets = [associations_record_set]

### Assemble Dataset and Write JSON
Once we have defined each dataset layer, we can package the dataset metadata into a Croissant JSON-LD file.

In [10]:
# Collect every dataset-level attribute defined in the cells above, then build
# the Croissant metadata object from them.
metadata_kwargs = {
    "name": name,
    "description": description,
    "cite_as": cite_as,
    "url": url,
    "date_published": date_published,
    "creators": creators,
    "publisher": publishers,
    "license": license,
    "version": version,
    "distribution": distribution,
    "record_sets": record_sets,
}
metadata = mlc.Metadata(**metadata_kwargs)

# Display any warnings/suggestions encountered by the validator when building the metadata
issue_report = metadata.issues.report()
print(issue_report)




In [11]:
# Serialize the metadata to an indented JSON document on disk.
with open("lincsl1000crisprconsensus.json", "w") as f:
    # default=str stringifies values json cannot encode natively.
    content = json.dumps(metadata.to_json(), indent=4, default=str)
    # Terminate file with newline
    f.write(f"{content}\n")