In [40]:
import requests
import json
import time
from tqdm import tqdm
import pandas as pd

In [2]:
# Ask the speedrichr statistics endpoint for every library and keep the
# names of those whose categoryId is 5.
stats_response = requests.get('https://maayanlab.cloud/speedrichr/api/datasetStatistics')
data = stats_response.json()
cell_type_libraries = []
for entry in data["statistics"]:
	if entry['categoryId'] == 5:
		cell_type_libraries.append(entry['libraryName'])
len(cell_type_libraries)

27

In [46]:
# Hand-written list of library names used by the rest of the notebook.
# Rebinds `cell_type_libraries`, replacing the list built from the
# datasetStatistics response.
# NOTE(review): the last entry is spelled 'Jensen TISSUES' (with a space) while
# the library set printed at the end of the notebook shows 'Jensen_TISSUES' --
# confirm which spelling the termmap endpoint actually returns.
cell_type_libraries = [
 'ESCAPE',
 'Allen_Brain_Atlas_10x_scRNA_2021',
 'Descartes_Cell_Types_and_Tissue_2021',
 'HuBMAP_ASCT_plus_B_augmented_w_RNAseq_Coexpression',
 'CellMarker_Augmented_2021',
 'PanglaoDB_Augmented_2021',
 'Azimuth_Cell_Types_2021',
 'HuBMAP_ASCTplusB_augmented_2022',
 'Tabula_Sapiens',
 'MAGNET_2023',
 'Azimuth_2023',
 'CellMarker_2024',
 'Jensen TISSUES'
 ]

In [22]:
def term_search(term):
	"""Search Enrichr's term map and keep only hits from the cell-type libraries.

	Args:
		term: Free-text search string sent as the ``meta`` query parameter.

	Returns:
		dict mapping library name to the value the API lists under that
		library, restricted to libraries present in ``cell_type_libraries``.

	Raises:
		Exception: with the response body as message when the HTTP status
			is not OK.
	"""
	# Fixed pause before every request so repeated calls are spaced out.
	time.sleep(0.3)
	# Pass the term through `params` so requests percent-encodes it; pasting it
	# into the URL lets characters such as '+', '&' or '#' alter the query.
	res = requests.get(
		'https://maayanlab.cloud/Enrichr/termmap',
		params={'meta': term, 'json': 'true'},
	)
	if not res.ok:
		raise Exception(res.text)
	return {
		library: terms
		for library, terms in res.json()['terms'].items()
		if library in cell_type_libraries
	}

In [47]:
# Build two lookup tables from data/list. The file is split on blank lines into
# groups; the first line of a group is a ", "-separated list of organs and the
# remaining lines are cell types.
#   cell_type_dict[organs][library]        -> terms found by searching each organ
#   cell_type_dict_review[organs][library] -> terms found by searching each cell
#       type whose text does NOT contain any of the group's organ names
cell_type_dict = {}
cell_type_dict_review = {}

# Read everything first so the file is closed before the slow request loop.
with open('data/list') as list_file:
	organ_groups = list_file.read().split("\n\n")

for organ_group in tqdm(organ_groups):
	# A stray blank block (e.g. extra newlines at end of file) has nothing to search.
	if not organ_group.strip():
		continue
	organs, *cell_types = organ_group.split("\n")
	organ_list = organs.split(", ")

	organ_hits = cell_type_dict[organs] = {}
	for organ in organ_list:
		for library, terms in term_search(organ).items():
			organ_hits.setdefault(library, []).extend(terms)

	review_hits = cell_type_dict_review[organs] = {}
	organ_names = [name.lower() for name in organ_list]
	for cell_type in cell_types:
		cell_type = cell_type.replace(" cells", "")
		# Skip empty lines instead of sending an empty search to the API.
		if not cell_type.strip():
			continue
		for library, terms in term_search(cell_type).items():
			# Case-insensitive substring test against every organ in the group.
			unmatched = [
				term for term in terms
				if not any(name in term.lower() for name in organ_names)
			]
			if unmatched:
				review_hits.setdefault(library, []).extend(unmatched)
					


100%|██████████| 14/14 [01:03<00:00,  4.56s/it]


In [7]:
# Remove duplicate terms from every library list in both lookup tables.
# dict.fromkeys keeps the first occurrence of each term in its original
# position, so the exported JSON is identical from run to run (a set round-trip
# yields an arbitrary order).
for lookup in (cell_type_dict, cell_type_dict_review):
	for libraries in lookup.values():
		for library, terms in libraries.items():
			# Only values of existing keys are replaced, so iterating is safe.
			libraries[library] = list(dict.fromkeys(terms))

In [8]:
# Save the organ-search lookup table as indented JSON.
with open("out/cell_types.json", "w") as handle:
	json.dump(cell_type_dict, handle, indent=4)

In [9]:
# Save the cell-type-search lookup table as indented JSON.
with open("out/cell_types_review.json", "w") as handle:
	json.dump(cell_type_dict_review, handle, indent=4)

In [24]:
# Load the two JSON files from the reviewed/ directory.
with open("reviewed/cell_types.json") as reviewed_file:
	cell_types = json.load(reviewed_file)

with open("reviewed/cell_types_review.json") as reviewed_file:
	cell_types_review = json.load(reviewed_file)

In [36]:
# Merge the two reviewed tables per organ group and library.
# Only organ groups present in `cell_types` are emitted; libraries follow the
# order of `cell_type_libraries`, and a library absent from both tables is
# left out.
merged = {}
for organ, primary in cell_types.items():
	# An organ group missing from the review table contributes nothing from it.
	review = cell_types_review.get(organ, {})
	merged[organ] = {}
	for library in cell_type_libraries:
		if library in primary and library in review:
			# Union of both lists; dict.fromkeys drops duplicates while
			# keeping first-seen order so the output is reproducible.
			merged[organ][library] = list(dict.fromkeys([*primary[library], *review[library]]))
		elif library in primary:
			merged[organ][library] = primary[library]
		elif library in review:
			merged[organ][library] = review[library]

In [37]:
# Save the merged table as indented JSON.
with open("out/combined.json", "w") as handle:
	json.dump(merged, handle, indent=4)

In [38]:
# Display the organ-group keys present in the merged table.
merged.keys()

dict_keys(['Liver', 'Heart', 'Kidney', 'Skin', 'Bone, cartilage', 'Pancreas', 'Blood', 'Brain, nervous system', 'Testes', 'Ovary', 'Bone marrow', 'Eye, Nose, Ear', 'Lung', 'Intestine'])

In [45]:
# Write one single-column table per organ group. The column is named after the
# organ group; each library contributes its name followed by its terms, and
# consecutive libraries are separated by a "-----" row.
for organs, libraries in merged.items():
	rows = []
	for library, terms in libraries.items():
		if rows:
			rows.append("-----")
		rows.append(library)
		rows.extend(terms)
	# The files carry a .tsv extension, so write them tab-separated rather than
	# with to_csv's default comma dialect.
	pd.DataFrame(rows, columns=[organs]).to_csv("out/tissues/%s.tsv" % organs, sep="\t", index=False)

In [48]:
from glob import glob

In [49]:
def fetch_geneset(label, library):
	"""Fetch one term of an Enrichr gene-set library as JSON.

	Args:
		label: Term name sent as the ``term`` query parameter.
		library: Library name sent as the ``libraryName`` query parameter.

	Returns:
		The decoded JSON body of the response.

	Raises:
		Exception: with the response body as message when the HTTP status
			is not OK.
	"""
	# Let requests percent-encode the values; labels pasted straight into the
	# URL break the query when they contain '+', '&' or '#'.
	res = requests.get(
		'https://maayanlab.cloud/Enrichr/geneSetLibrary',
		params={'libraryName': library, 'mode': 'json', 'term': label},
	)
	if not res.ok:
		raise Exception(res.text)
	return res.json()

In [75]:
# Build out/cell_atlas.gmt from the mapping tables in data/mapped/.
# Each output line is: "<Tissue>:<Cell Type>", "<Library>:<Mapped Term>", then
# the gene set returned for that term, all tab-separated.
with open('out/cell_atlas.gmt', 'w') as gmt:
	# glob returns paths in arbitrary order; sort so the GMT is reproducible.
	for filename in tqdm(sorted(glob('data/mapped/*.tsv'))):
		# Tissue name is the title-cased file name without the .tsv extension.
		tissue = filename.split("/")[-1].replace(".tsv", "").title()
		df = pd.read_csv(filename, sep="\t")
		# Rows without a mapped term have nothing to fetch.
		mapped = df.dropna(subset=['Mapped Term'])
		for _, row in mapped.iterrows():
			cell_type = row['Cell Type']
			label = row['Mapped Term']
			library = row['Library']
			# The response is indexed by the requested term to get its genes.
			geneset = fetch_geneset(label, library)[label]
			gmt.write("\t".join(["%s:%s" % (tissue, cell_type), "%s:%s" % (library, label), *geneset]) + "\n")
			# Pause between requests.
			time.sleep(0.3)


100%|██████████| 14/14 [00:46<00:00,  3.34s/it]


In [68]:
# Display the current value of `label`.
# NOTE(review): presumably left over from debugging the GMT-building loop,
# which is where `label` is assigned -- confirm whether this cell is still needed.
label

'Sinusoidal Endothelial Cell Liver Human Liver Human'

In [1]:
# Collect the distinct library names used in the GMT file: the second
# tab-separated field of each line, up to its first ":".
libs = set()
with open('out/cell_atlas_from_enrichr.gmt') as gmt_file:
	for record in gmt_file:
		_term, source, *_genes = record.split("\t")
		libs.add(source.partition(":")[0])

In [2]:
# Display the set of library names collected from the GMT file.
libs

{'Azimuth_Cell_Types_2021',
 'CellMarker_2024',
 'CellMarker_Augmented_2021',
 'Descartes_Cell_Types_and_Tissue_2021',
 'HuBMAP_ASCT_plus_B_augmented_w_RNAseq_Coexpression',
 'HuBMAP_ASCTplusB_augmented_2022',
 'Jensen_TISSUES',
 'PanglaoDB_Augmented_2021',
 'Tabula_Sapiens'}