# RummaGEO Drug Perturbations Croissant Builder 🥐

Author: Ido Diamant, Ma'ayan Lab, CFDE DRC

In [1]:
import datetime
import hashlib
import json
import mlcroissant as mlc
import os
import pandas as pd
import requests
from zipfile import ZipFile

## Construct Croissant from Attribute Table

In [2]:
# Human-readable title and one-line summary placed in the Croissant metadata.
name = 'RummaGEO Drug Perturbation Signatures'
description = (
    "Drug perturbation signatures produced from automatically mined "
    "RNA-seq samples from GEO."
)

In [3]:
# Bibliographic details of the publication describing the dataset.
short_citation = 'Marino, Patterns, 2024'
title = 'RummaGEO: Automatic mining of human and mouse gene sets from GEO'
author = 'Marino, GB'
journal = 'Patterns'
year = 2024
# NOTE(review): confirm the volume number against the published article record.
volume = 52
pages = '101072'

# A BibTeX entry key must not contain commas (the first comma terminates the
# key), so derive a comma/space-free key from the short citation instead of
# embedding the short citation verbatim.
cite_key = ''.join(part.strip() for part in short_citation.split(','))

# BibTeX entry stored in the Croissant `citeAs` property.
cite_as = (
    f'@article{{{cite_key}, title={{{title}}}, author={{{author}}}, '
    f'journal={{{journal}}}, year={{{year}}}, volume={{{volume}}}, pages={{{pages}}}}}'
)

In [4]:
def _maayan_lab():
    """Return a new Organization record describing the Ma'ayan Lab."""
    return mlc.Organization(name="Ma'ayan Lab", url="https://maayanlab.cloud/")


# The lab is listed as both creator and publisher; each list holds its own
# Organization instance.
creators = [_maayan_lab()]
publishers = [_maayan_lab()]

# Dataset-level provenance values passed to mlc.Metadata further down.
license = "https://rummageo.com/about"
version = "0.1.0"
url = "https://maayanlab.cloud/Harmonizome/dataset/RummaGEO+Drug+Perturbation+Signatures"
date_published = datetime.date(year=2025, month=6, day=10)

### Resource

In [5]:
# Utility function to generate a SHA256 checksum for a FileObject from a URL
def get_sha256(url, chunk_size=65536, timeout=60):
    """Stream the resource at `url` and return its SHA-256 hex digest.

    Args:
        url: Address of the file to download and hash.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: Seconds passed to `requests.get` as its timeout, so a
            stalled server cannot hang the notebook indefinitely.

    Returns:
        The hexadecimal SHA-256 digest of the downloaded bytes.

    Raises:
        requests.HTTPError: If the server answers with an error status.
            Without this check the body of an error page would be hashed
            and silently recorded as the file's checksum.
    """
    sha256 = hashlib.sha256()
    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        for chunk in response.iter_content(chunk_size=chunk_size):
            sha256.update(chunk)
    return sha256.hexdigest()

In [6]:
# Zipped gene-attribute matrix hosted by Harmonizome.
file_url = 'https://maayanlab.cloud/static/hdfs/harmonizome/data/rummageochem/gene_attribute_matrix.txt.zip'

# Two-layer distribution: the downloadable zip archive, and the TSV inside it.
distribution = [
    mlc.FileObject(
        id="dataset-attribute-table-archive",
        name="dataset-attribute-table-archive",
        description="Dataset attribute table archive from Harmonizome.",
        content_url=file_url,
        encoding_formats="application/zip",
        # Checksum is computed by downloading the archive at build time.
        sha256=get_sha256(file_url)
    ),
    mlc.FileObject(
        id="dataset-attribute-table",
        name="dataset-attribute-table",
        description="Dataset attribute table from Harmonizome.",
        content_url="gene_attribute_matrix.txt",
        encoding_formats="text/tab-separated-values",
        # `contained_in` takes a list of parent ids; the previous set literal
        # is not JSON-serializable and has no defined order.
        contained_in=["dataset-attribute-table-archive"]
    )
]

### Structure/Semantics

In [7]:
# Load the tab-separated attribute matrix straight from the zipped download
# (read_table defaults to a tab delimiter) and preview it.
matrix = pd.read_table(file_url, compression='zip')
display(matrix)

Unnamed: 0,Gene,GSE100217_1_v_0_ethanol_mouse,GSE100217_1_v_3_ethanol_mouse,GSE100217_2_v_0_ethanol_mouse,GSE100217_2_v_3_ethanol_mouse,GSE100293_0_v_3_apelin_mouse,GSE100676_1_v_0_nitrate_human,GSE100676_1_v_2_nitrate_human,GSE101112_0_v_2_j147_mouse,GSE101112_0_v_4_j147_mouse,...,GSE96649_1_v_0_dexamethasone_human,GSE96649_1_v_2_dexamethasone_human,GSE96649_1_v_3_dexamethasone_human,GSE97352_1_v_2_dasatinib_human,GSE97352_3_v_2_dasatinib_human,GSE98973_3_v_0_sorafenib_mouse,GSE98973_3_v_1_sorafenib_mouse,GSE98973_3_v_2_sorafenib_mouse,GSE99187_1_v_0_gemcitabine_mouse,GSE99875_2_v_0_everolimus_human
0,A1BG,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,A1CF,0,0,0,-1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,A2M,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,A2ML1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,A3GALT2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19119,ZYG11A,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19120,ZYG11B,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19121,ZYX,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,1
19122,ZZEF1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


We'll also attach per-field metadata, taken from a file adapted from the meta files on the [RummaGEO download page](https://rummageo.com/download).

In [8]:
def _drop_last_token(label):
    """Return `label` without its final underscore-delimited token."""
    return '_'.join(label.split('_')[:-1])


# Read the per-signature metadata table, keyed by its `label` column.
pert_meta_raw = pd.read_csv(
    'https://maayanlab.cloud/static/hdfs/harmonizome/data/rummageochem/drug_perts_meta.txt.gz',
    sep='\t',
    compression='gzip',
    index_col='label',
)

# Shorten every label by one trailing token; the result is what the later
# field-building cell uses to look up matrix column names.
pert_meta_raw.index = pert_meta_raw.index.map(_drop_last_token)

# Labels that collide after shortening are collapsed to a single row using
# GroupBy.first() (first non-null entry per column within each group).
pert_meta = pert_meta_raw.groupby(level=0).first()
pert_meta

Unnamed: 0_level_0,id,title,term,direction,condition_1,condition_2,is_control_1,is_control_2,status,search_term
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
GSE100217_1_v_0_ethanol_mouse,696193ad-45bf-4c71-aa58-0f57d7feeb33,Heterogeneity of p53 dependent genomic respons...,GSE100217-1-vs-0-mouse.tsv,up,strain/background c57bl/6 /variation wt brain ...,strain/background c57bl/6 /variation p53 ko br...,True,False,signature,ethanol
GSE100217_1_v_3_ethanol_mouse,586ac9e9-e1bb-4be5-ae70-e0e6a15b19b9,Heterogeneity of p53 dependent genomic respons...,GSE100217-1-vs-3-mouse.tsv,up,strain/background c57bl/6 /variation wt brain ...,strain/background c57bl/6 /variation p53 ko br...,True,False,signature,ethanol
GSE100217_2_v_0_ethanol_mouse,35643833-624e-456e-9305-52d48151e51c,Heterogeneity of p53 dependent genomic respons...,GSE100217-2-vs-0-mouse.tsv,up,strain/background c57bl/6 /variation wt brain ...,strain/background c57bl/6 /variation p53 ko br...,True,False,signature,ethanol
GSE100217_2_v_3_ethanol_mouse,63f7a162-be9f-48be-9059-7f640288ad62,Heterogeneity of p53 dependent genomic respons...,GSE100217-2-vs-3-mouse.tsv,up,strain/background c57bl/6 /variation wt brain ...,strain/background c57bl/6 /variation p53 ko br...,True,False,signature,ethanol
GSE100293_0_v_3_apelin_mouse,65a089cd-f360-42b0-ab27-5c1f506b35ff,Apelin absence in endothelial cells,GSE100293-0-vs-3-mouse.tsv,dn,apelin wt rep strain c57bl6/j wild type endoth...,apelin ko rep strain c57bl6/j apln / endotheli...,True,False,signature,apelin
...,...,...,...,...,...,...,...,...,...,...
GSE98973_3_v_1_sorafenib_mouse,8c91aa9b-3df1-45c7-91a9-7bfc5a04358a,Kinome and transcriptome profiling reveal broa...,GSE98973-3-vs-1-mouse.tsv,dn,control heart sex female strain fvb murine,sunitinib heart sex female strain fvb murine,True,False,signature,sorafenib
GSE98973_3_v_2_sorafenib_mouse,77a5630b-9942-435a-ae53-0ad9ad87a90f,Kinome and transcriptome profiling reveal broa...,GSE98973-3-vs-2-mouse.tsv,up,control heart sex female strain fvb murine,sorafenib heart sex female strain fvb murine,True,False,signature,sorafenib
GSE99187_1_v_0_gemcitabine_mouse,9b643a8a-d369-47ba-ab10-023db2b2b641,In vivo response of EpCAM+ pancreatic cancer c...,GSE99187-1-vs-0-mouse.tsv,dn,kpcl pcc g strain fvb/nj pancreas age around 4...,kpcl pcc ga strain fvb/nj pancreas age around ...,True,False,signature,gemcitabine
GSE99349_2_v_5_cocaine_human,a2befb39-f246-4f94-b82c-f25b09b1f95d,Gene Network Dysregulation in Dorsolateral Pre...,GSE99349-2-vs-5-human,up,neun dlpfc total rnaseq prefrontal cortex case...,neun dlpfc total rnaseq prefrontal cortex case...,True,False,signature,cocaine


In [9]:
# Every field is declared as an array with one entry per matrix row.
array_size = matrix.shape[0]


def _table_source(column):
    """Build a Source that extracts `column` from the attribute table file."""
    return mlc.Source(
        file_object="dataset-attribute-table",
        extract=mlc.Extract(column=column),
    )


def _signature_field(col):
    """Build the integer Field for one signature column of the matrix.

    The description joins the two condition strings from `pert_meta`; their
    order is swapped when the row's `status` is not 'signature'.
    """
    first = pert_meta.loc[col, 'condition_1']
    second = pert_meta.loc[col, 'condition_2']
    if pert_meta.loc[col, 'status'] == 'signature':
        summary = f'{first} vs {second}'
    else:
        summary = f'{second} vs {first}'
    return mlc.Field(
        id=f"associations/{col.replace(' ','_')}",
        name=f"associations/{col}",
        description=summary,
        data_types=mlc.DataType.INTEGER,
        is_array=True,
        array_shape=str(array_size),
        source=_table_source(col),
    )


# The gene-symbol column comes first and serves as the record set key.
gene_field = mlc.Field(
    id="associations/gene",
    name="gene",
    description="The NCBI gene symbol",
    data_types=mlc.DataType.TEXT,
    is_array=True,
    array_shape=str(array_size),
    source=_table_source("Gene"),
)

fields = [gene_field]
# All remaining matrix columns (everything after the first) are signatures.
fields.extend(_signature_field(col) for col in matrix.columns[1:])

print(len(fields))

2868


In [10]:
# Single record set holding the gene key field plus one field per signature.
associations_record_set = mlc.RecordSet(
    id="associations",
    name="associations",
    key='associations/gene',
    fields=fields,
)

record_sets = [associations_record_set]

### Assemble Dataset and Write JSON
Once we have defined each dataset layer, we can package the dataset metadata into a Croissant JSON-LD file.

In [11]:
# Gather every dataset layer defined above into the keyword arguments of the
# top-level Croissant Metadata object.
metadata_kwargs = dict(
    name=name,
    description=description,
    cite_as=cite_as,
    url=url,
    date_published=date_published,
    creators=creators,
    publisher=publishers,
    license=license,
    version=version,
    distribution=distribution,
    record_sets=record_sets,
)
metadata = mlc.Metadata(**metadata_kwargs)

# Display any warnings/suggestions encountered by the validator when building the metadata
issue_report = metadata.issues.report()
print(issue_report)




In [12]:
# Serialize the Croissant metadata to an indented JSON file on disk.
with open("rummageodrug.json", "w") as out_file:
    croissant_json = metadata.to_json()
    # default=str stringifies values json cannot encode natively.
    serialized = json.dumps(croissant_json, indent=4, default=str)
    out_file.write(serialized)
    out_file.write("\n")  # Terminate file with newline