In [1]:
import json
import pandas as pd
from glob import glob
import os

In [7]:
with open('downloads.json') as o:
	downloads = json.loads(o.read())

In [8]:
sab_dict = {
	"LINCS": ["LINCS"],
	"4DN": ["4DN"],
	"ERCC": ["ERCCRBP", "ERCCREG"],
	"GlyGen": ["PROTEOFORM", "GLYCANS"],
	"GTEx": ["GTEXEXP", "GTEXEQTL"],
	"HuBMAP": ["AZ", "HMAZ"],
	"IDG": ["IDGP", "IDGD"],
	"KF": ["KF"],
	"MoTrPAC": ["MOTRPAC"],
	"MW": ["MW"],
	"SPARC": ["NPO"],
	"CLINVAR": ["CLINVAR"],
	"HGNCHPO": ["HGNCHPO"],
	"MSIGDB": ["MSIGDB"],
	"HGNCUNIPROT": ["HGNCUNIPROT"],
	"HGNCENZ": ["HGNCENZ"],
	"ARCHS4": ["ARCHS4"]
}


In [9]:
desc = {}
for i in downloads:
	desc[i["source"]] = i["description"]

In [10]:
desc

{'HuBMAP': 'Tissue, cell-type and gene specific markers from single-cell data',
 'IDG': 'Relationships between compounds, diseases, and proteins',
 'GlyGen': 'Associations from multiple glycomics database',
 'HGNCENZ': 'Enzyme Genes',
 'GTEx': 'Expression and eQTL data from GTEx',
 'SPARC': 'The SPARC Knowledge base of the Automatic Nervous System (SCKAN)',
 'HGNCHPO': 'HGNC gene node mapping to Human Phenotype Ontology',
 'CLINVAR': 'Assertions between human genes and phenotypes',
 'MoTrPAC': 'Differential gene expression data from endurance training exercise of young adult rats',
 'HGNCUNIPROT': 'Gene-Protein relationships',
 'KF': 'Genotypic and phenotypic data from the Pediatric Cardiac Genetics Consortium cohort in Kids First',
 'LINCS': 'Gene expression data from drug perturbation, as well as drug-drug similarity',
 'MW': 'Metabolite relationships with gene, disease, and cell',
 'MSIGDB': 'Five subsets of MSigDB v7.4 datasets',
 'ERCC': 'eCLIP-seq and CHIP-seq associations from E

In [11]:
dcc_descriptions = {
	'LINCS': 'Gene expression data from drug perturbation, as well as drug-drug similarity',
	'4DN': 'Chromatin loops called from Hi-C experiments performed in select cell lines',
	'ERCC': 'eCLIP-seq and CHIP-seq associations from ERCC',
	'GlyGen': 'Associations from multiple glycomics database',
	'GTEx': 'Expression and eQTL data from GTEx',
	"HuBMAP": "Tissue, cell-type and gene specific markers from single-cell data",
	'IDG': 'Relationships between compounds, diseases, and proteins',
	'KF': 'Genotypic and phenotypic data from the Pediatric Cardiac Genetics Consortium cohort in Kids First',
	'MoTrPAC': 'Differential gene expression data from endurance training exercise of young adult rats',
	'MW': 'Metabolite relationships with gene, disease, and cell',
	"SPARC": "The SPARC Knowledge base of the Automatic Nervous System (SCKAN)",
	'CLINVAR': 'Assertions between human genes and phenotypes',
	'HGNCHPO': 'HGNC gene node mapping to Human Phenotype Ontology',
	'MSIGDB': 'Five subsets of MSigDB v7.4 datasets',
	"HGNCUNIPROT": "Gene-Protein relationships",
	"HGNCENZ": "Enzyme Genes",
	"ARCHS4": "Coexpression Matrix based on Human RNA-seq studies from GEO",
}

In [12]:
def get_file_size(filename):
    size = float(os.path.getsize(filename))
    if size > 1000000000:
        return "%.2fGB"%(size/1000000000)
    if size > 1000000:
        return "%.2fMB"%(size/1000000)
    if size > 1000:
        return "%.2fKB"%(size/1000)
    else: return "%d bytes"%int(size)

In [13]:
downloads[0]

{'source': 'HuBMAP',
 'description': 'Tissue, cell-type and gene specific markers from single-cell data',
 'url': 'https://s3.amazonaws.com/maayan-kg/dd-kg/042624/HuBMAP.zip',
 'size': '70.99KB',
 'updated': '04-24-2024',
 'terms': 843,
 'edges': 1234}

In [14]:
downloads = []
for filename in glob('out/compressed/*.zip'):
	dcc = filename.replace("out/compressed/", "").replace(".zip", "")
	meta = {
		"source": dcc,
		"description": dcc_descriptions[dcc],
		"url": "https://s3.amazonaws.com/maayan-kg/dd-kg/042624/%s.zip"%dcc,
		"size": get_file_size(filename),
		"updated": '04-24-2024',
	}
	nodes = set()
	edge_count = 0
	for sab in sab_dict[dcc]:
		if os.path.isfile("out/sab/%s.edges.csv"%sab):
			df = pd.read_csv("out/sab/%s.edges.csv"%sab)
			edge_count = edge_count + len(df.index)
			nodes = nodes.union(df.source).union(df.target)
		elif sab == 'HGNCENZ':
			df = pd.read_csv("out/sab/HGNCENZ.Enzyme.nodes.csv", index_col=0)
			nodes = set(df.index)
	meta["terms"] = len(nodes)
	if edge_count:
		meta["edges"] = edge_count
	downloads.append(meta)
	

In [15]:
downloads[0]

{'source': 'HuBMAP',
 'description': 'Tissue, cell-type and gene specific markers from single-cell data',
 'url': 'https://s3.amazonaws.com/maayan-kg/dd-kg/042624/HuBMAP.zip',
 'size': '70.99KB',
 'updated': '04-24-2024',
 'terms': 843,
 'edges': 1234}

In [16]:
with open('downloads.json', 'w') as o:
	o.write(json.dumps(downloads, indent=4))