# RummaGEO Gene Perturbations Croissant Builder 🥐

Author: Ido Diamant, Ma'ayan Lab, CFDE DRC

In [1]:
import datetime
import hashlib
import json
import mlcroissant as mlc
import os
import pandas as pd
import requests
from zipfile import ZipFile

## Construct Croissant from Attribute Table

In [2]:
# Human-readable dataset title and summary used in the Croissant metadata.
name = "RummaGEO Gene Perturbation Signatures"
description = (
    "Single gene perturbation signatures produced by querying RummaGEO metadata "
    "for knockouts, knockdowns, and over-expression conditions."
)

In [3]:
# Bibliographic details of the RummaGEO publication cited by this dataset.
short_citation = 'Marino, Patterns, 2024'
# BibTeX entry keys must not contain commas or spaces: the first comma ends the
# key, so using `short_citation` as the key yields an unparsable entry. A
# dedicated, comma-free key is used for the BibTeX record instead.
cite_key = 'Marino2024RummaGEO'
title = 'RummaGEO: Automatic mining of human and mouse gene sets from GEO'
author = 'Marino, GB'
journal = 'Patterns'
year = 2024
# NOTE(review): the article appears in Patterns volume 5 (issue 10, 2024);
# the previous value of 52 looked like a typo — confirm against the publisher.
volume = 5
pages = '101072'

# Doubled braces are literal braces in an f-string; the innermost pair interpolates.
cite_as = (
    f'@article{{{cite_key}, title={{{title}}}, author={{{author}}}, '
    f'journal={{{journal}}}, year={{{year}}}, volume={{{volume}}}, pages={{{pages}}}}}'
)

In [4]:
# The same lab is both creator and publisher; each list holds its own
# Organization instance.
creators = [mlc.Organization(name="Ma'ayan Lab", url="https://maayanlab.cloud/")]
publishers = [mlc.Organization(name="Ma'ayan Lab", url="https://maayanlab.cloud/")]

# Dataset-level descriptors passed to mlc.Metadata further down.
url = "https://maayanlab.cloud/Harmonizome/dataset/RummaGEO+Gene+Perturbation+Signatures"
license = "https://rummageo.com/about"
version = "0.1.0"
date_published = datetime.date(2025, 6, 10)

### Resource

In [5]:
def get_sha256(url, timeout=60):
    """Compute the SHA-256 checksum of the file served at ``url``.

    The response body is streamed in 64 KiB chunks and fed to the hash
    incrementally, so the payload is never held in memory as a whole.

    Args:
        url: URL of the file to download and hash.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot hang the notebook indefinitely.

    Returns:
        The hexadecimal SHA-256 digest of the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Without this check the body of an error page would be hashed
            and recorded as if it were the file's checksum.
    """
    sha256 = hashlib.sha256()
    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        for chunk in response.iter_content(chunk_size=65536):
            sha256.update(chunk)
    return sha256.hexdigest()

In [6]:
# Location of the zipped gene-attribute matrix hosted by Harmonizome.
file_url = 'https://maayanlab.cloud/static/hdfs/harmonizome/data/rummageogene/gene_attribute_matrix.txt.zip'

distribution = [
    # The downloadable archive; its checksum is computed from the live file.
    mlc.FileObject(
        id="dataset-attribute-table-archive",
        name="dataset-attribute-table-archive",
        description="Dataset attribute table archive from Harmonizome.",
        content_url=file_url,
        encoding_formats="application/zip",
        sha256=get_sha256(file_url)
    ),
    # The tab-separated table stored inside the archive above.
    mlc.FileObject(
        id="dataset-attribute-table",
        name="dataset-attribute-table",
        description="Dataset attribute table from Harmonizome.",
        content_url="gene_attribute_matrix.txt",
        encoding_formats="text/tab-separated-values",
        # A list rather than a set literal: sets are unordered and are not
        # JSON-serializable, so they can be mangled when the metadata is dumped.
        contained_in=["dataset-attribute-table-archive"]
    )
]

### Structure/Semantics

In [7]:
# Read the zipped, tab-delimited attribute matrix straight from its URL.
matrix = pd.read_csv(
    file_url,
    compression='zip',
    sep='\t',
)
display(matrix)

Unnamed: 0,Gene,GSE100102_0_v_3_cgas_mouse,GSE100102_0_v_5_cgas_mouse,GSE100102_0_v_6_cgas_mouse,GSE100102_2_v_3_cgas_mouse,GSE100102_2_v_5_cgas_mouse,GSE100102_2_v_6_cgas_mouse,GSE100102_4_v_3_cgas_mouse,GSE100102_4_v_5_cgas_mouse,GSE100102_4_v_6_cgas_mouse,...,GSE98898_7_v_2_hoxa1_human,GSE98898_8_v_1_hoxa1_human,GSE98964_0_v_2_tet2_mouse,GSE98964_0_v_4_tet2_mouse,GSE98964_1_v_2_tet2_mouse,GSE98964_1_v_4_tet2_mouse,GSE99112_0_v_1_upf3b_mouse,GSE99687_0_v_1_smad5_human,GSE99687_2_v_1_smad5_human,GSE99687_3_v_1_smad5_human
0,A1BG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A1CF,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,A2M,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,A2ML1,0,0,0,0,0,0,0,0,0,...,0,-1,0,0,0,0,0,-1,0,0
4,A3GALT2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19125,ZYG11A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,-1,0,0
19126,ZYG11B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19127,ZYX,-1,-1,-1,0,-1,-1,-1,-1,-1,...,0,1,1,0,0,0,0,0,0,0
19128,ZZEF1,-1,-1,-1,0,0,-1,-1,-1,-1,...,0,0,0,0,0,0,0,0,0,0


We'll also incorporate metadata available for each field from a file adapted from the meta files available on the [RummaGEO download page](https://rummageo.com/download).

In [10]:
# Per-signature metadata, indexed by the 'label' column of the gzipped TSV.
pert_meta = pd.read_csv(
    'https://maayanlab.cloud/static/hdfs/harmonizome/data/rummageogene/gene_perts_meta.txt.gz',
    sep='\t',
    compression='gzip',
    index_col='label',
)
# Drop everything from the last underscore onward in each label
# (a label without any underscore becomes the empty string).
pert_meta.index = pert_meta.index.map(lambda label: label.rpartition('_')[0])
# Labels that collapse to the same value are merged, keeping the first
# non-null entry of every column.
pert_meta = pert_meta.groupby(level=0).first()
pert_meta

Unnamed: 0_level_0,id,title,term,direction,condition_1,condition_2,is_control_1,is_control_2,status,search_term
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"GSE100075,GSE100076_0_v_3_esr1_human",1d7b0b05-e984-41f9-aa76-75b63f4bbb6d,Discovery of naturally occurring ESR1 mutation...,"GSE100075,GSE100076-0-vs-3-human",up,sum44 wt breast cancer cell line biomaterial p...,sum44 lted breast cancer cell line biomaterial...,True,False,signature,esr1
"GSE100075,GSE100076_0_v_4_esr1_human",5bd4b974-c7da-4dfa-9fbd-752e580b919c,Discovery of naturally occurring ESR1 mutation...,"GSE100075,GSE100076-0-vs-4-human",up,sum44 wt breast cancer cell line biomaterial p...,mcf7 lted esr1 y537c breast cancer cell line b...,True,False,signature,esr1
"GSE100075,GSE100076_1_v_3_esr1_human",8e20b526-2b3a-4f18-bb41-a21b2f0cc404,Discovery of naturally occurring ESR1 mutation...,"GSE100075,GSE100076-1-vs-3-human",dn,mcf7 lted esr1 wt breast cancer cell line biom...,sum44 lted breast cancer cell line biomaterial...,True,False,signature,esr1
"GSE100075,GSE100076_1_v_4_esr1_human",458343a6-0446-45aa-a7fe-cf425535019b,Discovery of naturally occurring ESR1 mutation...,"GSE100075,GSE100076-1-vs-4-human",dn,mcf7 lted esr1 wt breast cancer cell line biom...,mcf7 lted esr1 y537c breast cancer cell line b...,True,False,signature,esr1
"GSE100075,GSE100076_2_v_3_esr1_human",32213c4a-7898-4f95-b32d-3da3beac284d,Discovery of naturally occurring ESR1 mutation...,"GSE100075,GSE100076-2-vs-3-human",up,mcf7 wt breast cancer cell line biomaterial pr...,sum44 lted breast cancer cell line biomaterial...,True,False,signature,esr1
...,...,...,...,...,...,...,...,...,...,...
"GSE99973,GSE99978_4_v_1_xpo5_mouse",2410b264-659b-4145-a2fd-75e4da410604,Thiol-linked alkylation for the metabolic sequ...,"GSE99973,GSE99978-4-vs-1-mouse.tsv",dn,mesc pulse mouse embryonic stem (mes) cells /v...,mesc xpo5ko pulse mouse embryonic stem (mes) c...,True,False,signature,xpo5
"GSE99973,GSE99978_4_v_3_xpo5_mouse",0c64fefe-f1a1-44d3-814b-ad48a2cc9892,Thiol-linked alkylation for the metabolic sequ...,"GSE99973,GSE99978-4-vs-3-mouse.tsv",dn,mesc pulse mouse embryonic stem (mes) cells /v...,mesc xpo5ko s4u mouse embryonic stem (mes) cel...,True,False,signature,xpo5
"GSE99974,GSE99978_0_v_1_mettl3_mouse",35267a48-af9a-4acd-b8e6-f6809978d143,Thiol-linked alkylation for the metabolic sequ...,"GSE99974,GSE99978-0-vs-1-mouse.tsv",dn,mesc pulse mouse embryonic stem (mes) cells /v...,mesc mettl3ko pulse mouse embryonic stem (mes)...,True,False,signature,mettl3
"GSE99974,GSE99978_0_v_3_mettl3_mouse",ab201488-6e0f-4371-85ac-41343a298cdd,Thiol-linked alkylation for the metabolic sequ...,"GSE99974,GSE99978-0-vs-3-mouse.tsv",dn,mesc pulse mouse embryonic stem (mes) cells /v...,mesc mettl3ko s4u mouse embryonic stem (mes) c...,True,False,signature,mettl3


In [11]:
# Every field is an array with one entry per row of the attribute matrix.
array_size = matrix.shape[0]


def _column_field(field_id, field_name, field_description, data_type, column):
    """Build an array-valued Field that reads `column` from the attribute table."""
    return mlc.Field(
        id=field_id,
        name=field_name,
        description=field_description,
        data_types=data_type,
        is_array=True,
        array_shape=str(array_size),
        source=mlc.Source(
            file_object="dataset-attribute-table",
            extract=mlc.Extract(column=column),
        ),
    )


def _comparison_description(column):
    """Describe a signature column as '<condition> vs <condition>' from pert_meta.

    Rows whose status is 'signature' list condition_1 first; all other rows
    list condition_2 first.
    """
    first = pert_meta.loc[column, 'condition_1']
    second = pert_meta.loc[column, 'condition_2']
    if pert_meta.loc[column, 'status'] == 'signature':
        return f'{first} vs {second}'
    return f'{second} vs {first}'


# The gene-symbol column comes first ...
fields = [
    _column_field(
        "associations/gene",
        "gene",
        "The NCBI gene symbol",
        mlc.DataType.TEXT,
        "Gene",
    )
]

# ... followed by one integer field for every remaining matrix column.
fields.extend(
    _column_field(
        f"associations/{signature.replace(' ','_')}",
        f"associations/{signature}",
        _comparison_description(signature),
        mlc.DataType.INTEGER,
        signature,
    )
    for signature in matrix.columns[1:]
)

print(len(fields))

4413


In [12]:
# A single record set holds all fields, keyed on the gene field.
associations_record_set = mlc.RecordSet(
    id="associations",
    name="associations",
    key='associations/gene',
    fields=fields,
)
record_sets = [associations_record_set]

### Assemble Dataset and Write JSON
Once we have defined each dataset layer, we can package the dataset metadata into a Croissant JSON-LD file.

In [13]:
# Combine the descriptive attributes, the file distribution and the record
# sets defined in the cells above into one Croissant Metadata object.
metadata = mlc.Metadata(
    name=name,
    description=description,
    cite_as=cite_as,
    url=url,
    date_published=date_published,
    creators=creators,
    publisher=publishers,
    license=license,
    version=version,
    distribution=distribution,
    record_sets=record_sets,
)

# Show the warnings/suggestions the validator collected while building the metadata.
issues_report = metadata.issues.report()
print(issues_report)




In [14]:
# Serialize before opening the output file: opening with "w" truncates it
# immediately, so a failure during serialization would otherwise leave an
# empty or partial rummageogene.json behind.
# default=str stringifies values json cannot encode natively (e.g. dates).
content = json.dumps(metadata.to_json(), indent=4, default=str)

with open("rummageogene.json", "w", encoding="utf-8") as f:
  f.write(content)
  f.write("\n")  # Terminate file with newline