# LINCS Kinativ Kinase Inhibitor Bioactivity Profiles Croissant Builder 🥐

Author: Ido Diamant, Ma'ayan Lab, CFDE DRC

In [1]:
import datetime
import hashlib
import json
import mlcroissant as mlc
import os
import pandas as pd
import requests
from zipfile import ZipFile

## Construct Croissant from CFDE Attribute Table

In [2]:
# Dataset title and one-line summary used for the Croissant metadata.
name = "LINCS Kinativ Kinase Inhibitor Bioactivity Profiles"
description = (
    "Percent inhibition of kinases by small molecules measured in cell lysates"
)

In [3]:
# Creator organization(s) of the dataset.
# The name is written without trailing whitespace so that the emitted JSON-LD
# does not carry a stray space at the end of the organization name.
creators = [
    mlc.Organization(
        name="The Library of Integrated Network-Based Cellular Signatures",
        url="https://lincsproject.org/",
    )
]

# Publisher organization(s) of the dataset.
publishers = [
    mlc.Organization(name="Ma'ayan Lab", url="https://maayanlab.cloud/")
]

# Dataset-level descriptive values: license URL, metadata version string,
# landing page URL and publication date.
license = "https://lincsproject.org/LINCS/data/release-policy"
version = "0.1.0"
url = "https://maayanlab.cloud/Harmonizome/dataset/LINCS+Kinativ+Kinase+Inhibitor+Bioactivity+Profiles"
date_published = datetime.date(2015, 4, 6)

### Resource

In [4]:
# Utility function to generate a SHA256 checksum for a FileObject from a URL
def get_sha256(url, chunk_size=65536, timeout=60):
    """Return the SHA-256 hex digest of the content served at ``url``.

    The body is streamed and hashed chunk by chunk, so the whole file is never
    held in memory at once.

    Args:
        url: Address of the file to download and hash.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The hexadecimal SHA-256 digest as a string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
            Without this check the body of an error page would be hashed
            silently and recorded as the file's checksum.
    """
    sha256 = hashlib.sha256()
    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        for chunk in response.iter_content(chunk_size=chunk_size):
            sha256.update(chunk)
    return sha256.hexdigest()

In [5]:
# Location of the zipped gene-attribute matrix; also read directly by pandas
# in a later cell.
file_url = 'https://maayanlab.cloud/static/hdfs/harmonizome/data/kinativ/gene_attribute_matrix_standardized.txt.zip'

# Two resources: the downloadable zip archive (with its checksum) and the
# tab-separated table contained inside that archive.
distribution = [
    mlc.FileObject(
        id="dataset-attribute-table-archive",
        name="dataset-attribute-table-archive",
        description="Dataset attribute table archive from Harmonizome.",
        content_url=file_url,
        encoding_formats="application/zip",
        # Downloads the archive once to compute its checksum.
        sha256=get_sha256(file_url)
    ),
    mlc.FileObject(
        id="dataset-attribute-table",
        name="dataset-attribute-table",
        description="Dataset attribute table from Harmonizome.",
        # NOTE(review): the archive is named "..._standardized.txt.zip"; confirm
        # that its member file is really called "gene_attribute_matrix.txt".
        content_url="gene_attribute_matrix.txt",
        encoding_formats="text/tab-separated-values",
        # A list (not a set literal): containedIn is an ordered, JSON-serializable
        # sequence of parent resource ids.
        contained_in=["dataset-attribute-table-archive"]
    )
]

### Structure/Semantics

In [6]:
# Read the zipped, tab-separated attribute table straight from its URL and
# show it.
matrix = pd.read_csv(
    file_url,
    compression="zip",
    sep="\t",
)
display(matrix)

Unnamed: 0,GeneSym,BX795_NGP,CH5424802_HeLa,DCC-2036_Ramos (RA 1),HG-6-64-01_A-375,Ibrutinib_Ramos (RA 1),JWE-035_HeLa,JWE-035_PC-3,NPK76-II-72-1_HeLa,NPK76-II-72-1_PC-3,...,XMD11-50_HeLa,XMD11-50_PC-3,XMD11-85h_HeLa,XMD11-85h_PC-3,XMD16-144_HeLa,XMD16-144_PC-3,XMD8-85_HeLa,XMD8-85_PC-3,XMD8-92_HeLa,XMD8-92_PC-3
0,ABL1,-0.275715,-0.003411,-0.013058,0.479351,-0.061168,0.445118,-0.013058,-0.145288,-0.013058,...,-0.023683,-0.013058,-0.179513,-0.013058,0.397780,-0.013058,-0.318844,-0.013058,-0.133941,-0.013058
1,ADRBK1,0.033496,-0.013058,0.027273,-0.013058,-0.016824,0.003169,-0.013058,0.023040,-0.013058,...,0.062498,-0.013058,-0.155862,-0.013058,-0.006726,-0.013058,-0.103744,-0.013058,0.021510,-0.013058
2,AKT1,-0.013058,-0.013058,-0.013058,0.000974,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,...,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058
3,AKT2,-0.013058,-0.038101,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,...,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058
4,AKT3,-0.013058,-0.038101,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,...,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058,-0.013058
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227,WNK3,0.110463,-0.082268,-0.013058,-0.064073,-0.013058,0.026886,-0.013058,-0.049302,-0.013058,...,0.015063,-0.013058,-0.025992,-0.013058,0.011686,-0.013058,-0.084855,-0.013058,-0.053771,-0.013058
228,WNK4,-0.013058,-0.008579,-0.013058,-0.010813,-0.013058,-0.013058,-0.038893,-0.013058,-0.015880,...,-0.013058,0.005007,-0.013058,-0.080335,-0.013058,0.042184,-0.013058,-0.040282,-0.013058,-0.000302
229,YES1,0.028046,0.056731,0.149927,0.338482,0.015817,-0.013058,0.161507,-0.013058,-0.210208,...,-0.013058,0.013559,-0.013058,0.011686,-0.013058,0.295609,-0.013058,0.009821,-0.013058,0.019985
230,ZAK,0.179242,-0.024067,0.378267,0.432609,0.099753,-0.034162,0.033888,-0.145288,-0.057456,...,0.018465,-0.001945,-0.177012,0.018465,0.051836,0.090536,-0.058279,-0.011934,-0.016069,0.027659


In [7]:
# Every field is declared as an array with one entry per table row.
array_size = matrix.shape[0]

# Take the gene column's name from the loaded table instead of hard-coding it:
# the first column of `matrix` is the one skipped by `matrix.columns[1:]` below,
# and a hard-coded name that does not match the table header would leave the
# gene field pointing at a non-existent column.
gene_column = matrix.columns[0]

fields = [
  mlc.Field(
    id="associations/gene",
    name="gene",
    description="The NCBI gene symbol",
    data_types=mlc.DataType.TEXT,
    is_array=True,
    array_shape=str(array_size),
    source=mlc.Source(
      file_object="dataset-attribute-table",
      extract=mlc.Extract(column=gene_column)
    )
  )
]

# One float field per remaining column of the table.
for col in matrix.columns[1:]:
  fields.append(
    mlc.Field(
      # Spaces in the column header are replaced in the id only; the name and
      # the extracted column keep the header exactly as it appears in the table.
      id=f"associations/{col.replace(' ','_')}",
      name=f"associations/{col}",
      data_types=mlc.DataType.FLOAT,
      is_array=True,
      array_shape=str(array_size),
      source=mlc.Source(
        file_object="dataset-attribute-table",
        extract=mlc.Extract(column=col)
      )
    )
  )

print(len(fields))

27


In [8]:
# A single record set that groups all fields and is keyed by the gene field.
associations_record_set = mlc.RecordSet(
    id="associations",
    name="associations",
    key="associations/gene",
    fields=fields,
)
record_sets = [associations_record_set]

### Assemble Dataset and Write JSON
Once we have defined each dataset layer, we can package the dataset metadata into a Croissant JSON-LD file.

In [9]:
# Bundle the descriptive values, resources and record sets into one
# Croissant Metadata object.
metadata = mlc.Metadata(
    name=name,
    description=description,
    url=url,
    version=version,
    license=license,
    date_published=date_published,
    creators=creators,
    publisher=publishers,
    distribution=distribution,
    record_sets=record_sets,
)

# Display any warnings/suggestions encountered by the validator when building the metadata
issues_report = metadata.issues.report()
print(issues_report)

  -  [Metadata(LINCS Kinativ Kinase Inhibitor Bioactivity Profiles)] Property "http://mlcommons.org/croissant/citeAs" is recommended, but does not exist.


In [10]:
# Serialize the metadata to indented JSON and save it next to the notebook.
with open("lincskinativ.json", "w") as out_file:
  # default=str stringifies any value json cannot encode natively.
  serialized = json.dumps(metadata.to_json(), indent=4, default=str)
  # Terminate file with newline
  out_file.write(serialized + "\n")