# IMPC Knockout Mouse Phenotypes Croissant Builder 🥐

Author: Ido Diamant, Ma'ayan Lab, CFDE DRC

In [1]:
import datetime
import hashlib
import json
import mlcroissant as mlc
import os
import pandas as pd
import requests
from zipfile import ZipFile

## Construct Croissant from CFDE Attribute Table

In [2]:
# Human-readable title and one-line summary stored in the Croissant metadata.
name = "IMPC Knockout Mouse Phenotypes"
description = "Observed phenotypes of mice following gene knockout"

In [3]:
# Bibliographic details of the publication cited for this dataset.
short_citation = "Groza, Nucleic Acids Res, 2023"
title = "The International Mouse Phenotyping Consortium: comprehensive knockout phenotyping underpinning the study of human disease"
author = "Groza, T"
journal = "Nucleic Acids Res"
year = 2023
volume = 51
pages = "D1038-D1045"

# Assemble a BibTeX-style @article entry: the short citation acts as the entry
# key and every other value is emitted as `field={value}`, in this order.
# NOTE(review): the entry key contains spaces and commas - confirm that the
# consumers of this string accept such a key.
bibtex_fields = {
    "title": title,
    "author": author,
    "journal": journal,
    "year": year,
    "volume": volume,
    "pages": pages,
}
field_text = ", ".join(f"{key}={{{value}}}" for key, value in bibtex_fields.items())
cite_as = f"@article{{{short_citation}, {field_text}}}"

In [4]:
# License, version, landing page and publication date recorded in the metadata.
license = "https://creativecommons.org/licenses/by/4.0/"
version = "0.1.0"
url = "https://maayanlab.cloud/Harmonizome/dataset/IMPC+Knockout+Mouse+Phenotypes"
date_published = datetime.date(2023, 10, 18)

# Organization listed as the creator of the dataset.
impc = mlc.Organization(
    name="International Mouse Phenotyping Consortium",
    url="https://www.mousephenotype.org/",
)
creators = [impc]

# Organization listed as the publisher of the dataset.
maayan_lab = mlc.Organization(
    name="Ma'ayan Lab",
    url="https://maayanlab.cloud/",
)
publishers = [maayan_lab]

### Resource

In [5]:
# Utility function to generate a SHA256 checksum for a FileObject from a URL
def get_sha256(url):
    """Return the hex-encoded SHA-256 digest of the content served at ``url``.

    The response body is streamed in 64 KiB chunks and fed to the hash
    incrementally, so the whole file is never held in memory at once.

    Args:
        url: Address of the file to download and hash.

    Returns:
        The SHA-256 digest as a lowercase hexadecimal string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeouts.
    """
    sha256 = hashlib.sha256()
    # The context manager releases the streamed connection even if hashing
    # fails; the timeout keeps a stalled server from hanging the notebook.
    with requests.get(url, stream=True, timeout=60) as response:
        # Without this check an HTML error page would be hashed as if it
        # were the dataset, producing a wrong checksum without any warning.
        response.raise_for_status()
        for chunk in response.iter_content(chunk_size=65536):
            sha256.update(chunk)
    return sha256.hexdigest()

In [6]:
# Location of the gzip-compressed gene-attribute matrix.
file_url = 'https://maayanlab.cloud/static/hdfs/harmonizome/data/komp/gene_attribute_matrix.txt.gz'

# Two file objects describe the resource: the downloadable gzip archive and
# the tab-separated table contained in it.
distribution = [
    mlc.FileObject(
        id="dataset-attribute-table-archive",
        name="dataset-attribute-table-archive",
        description="Dataset attribute table archive from Harmonizome.",
        content_url=file_url,
        encoding_formats="application/gzip",
        # The checksum is computed now by downloading the archive.
        sha256=get_sha256(file_url),
    ),
    mlc.FileObject(
        id="dataset-attribute-table",
        name="dataset-attribute-table",
        description="Dataset attribute table from Harmonizome.",
        content_url="gene_attribute_matrix.txt",
        encoding_formats="text/tab-separated-values",
        # A list of parent ids (not a set): a set is not JSON-serializable and
        # would be silently stringified by a `default=str` JSON encoder.
        contained_in=["dataset-attribute-table-archive"],
    ),
]

### Structure/Semantics

In [7]:
# Load the gzip-compressed, tab-separated matrix straight from its URL and
# show it in the notebook.
matrix = pd.read_csv(
    file_url,
    sep="\t",
    compression="gzip",
)
display(matrix)

Unnamed: 0,Gene,abnormal QT variability,abnormal abdominal wall morphology,abnormal adrenal gland morphology,abnormal allantois morphology,abnormal auditory brainstem response,abnormal autopod morphology,abnormal behavior,abnormal blood urea nitrogen level,abnormal blood uric acid level,...,thin ventricular wall,thrombocytopenia,thrombocytosis,tremors,trunk curl,unresponsive to tactile stimuli,urinary bladder obstruction,vertebral fusion,vertebral transformation,wrinkled skin
0,A1CF,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,AACS,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AADAC,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,AAK1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,AAMP,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6758,ZSWIM6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6759,ZWILCH,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6760,ZWINT,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6761,ZYG11B,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Every field is declared as an array whose shape is the matrix row count.
array_size = matrix.shape[0]
array_kwargs = {"is_array": True, "array_shape": str(array_size)}


def _column_source(column):
    """Return a Source extracting ``column`` from the attribute table file object."""
    return mlc.Source(
        file_object="dataset-attribute-table",
        extract=mlc.Extract(column=column),
    )


# The text field read from the "Gene" column comes first.
fields = [
    mlc.Field(
        id="associations/gene",
        name="gene",
        description="The NCBI gene symbol",
        data_types=mlc.DataType.TEXT,
        source=_column_source("Gene"),
        **array_kwargs,
    )
]

# One integer field for each column after the first; spaces in the column
# name become underscores in the field id only, not in the field name.
for col in matrix.columns[1:]:
    field_id = "associations/" + col.replace(" ", "_")
    fields.append(
        mlc.Field(
            id=field_id,
            name=f"associations/{col}",
            data_types=mlc.DataType.INTEGER,
            source=_column_source(col),
            **array_kwargs,
        )
    )

print(len(fields))

668


In [9]:
# A single record set groups all fields and is keyed on the gene field.
associations_record_set = mlc.RecordSet(
    id="associations",
    name="associations",
    key="associations/gene",
    fields=fields,
)
record_sets = [associations_record_set]

### Assemble Dataset and Write JSON
Once we have defined each dataset layer, we can package the dataset metadata into a Croissant JSON-LD file.

In [10]:
# Collect the dataset-level attributes, the resource layer and the record sets
# into one set of keyword arguments for the Croissant metadata object.
metadata_kwargs = {
    "name": name,
    "description": description,
    "cite_as": cite_as,
    "url": url,
    "date_published": date_published,
    "creators": creators,
    "publisher": publishers,
    "license": license,
    "version": version,
    "distribution": distribution,
    "record_sets": record_sets,
}
metadata = mlc.Metadata(**metadata_kwargs)

# Print the report of issues collected while the metadata was built.
issue_report = metadata.issues.report()
print(issue_report)




In [11]:
# Serialize the metadata to indented JSON and save it; values the JSON encoder
# cannot handle natively are converted with str().
with open("impckomp2.json", "w") as out_file:
    croissant_json = json.dumps(metadata.to_json(), indent=4, default=str)
    # Terminate the file with a newline.
    out_file.write(croissant_json + "\n")