In [1]:
import pandas as pd
import requests
import json
import re

In [2]:
rummageo_url = "https://rummageo.com/graphql"

In [3]:
def query_metadata(term, limit=10000):
    query = {"operationName":"TermSearch","variables":{"terms":term,"offset":0,"first":limit},"query":"query TermSearch($terms: [String]! = [\"neuron\"], $offset: Int = 0, $first: Int = 10) {\n  geneSetTermSearch(terms: $terms, offset: $offset, first: $first) {\n    nodes {\n      id\n      term\n      gse\n      platform\n      pmid\n      publishedDate\n      sampleGroups\n      title\n      geneSetById {\n        nGeneIds\n        species\n        __typename\n      }\n      __typename\n    }\n    totalCount\n    __typename\n  }\n}\n"}
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json"
    }

    res = requests.post(rummageo_url, data=json.dumps(query), headers=headers)
    return res.json()

In [4]:
control_terms = ['wt', 'wildtype', 'control', 'cntrl', 'ctrl', 'uninfected', 'normal', 'untreated', 'unstimulated', 'shctrl', 'ctl', 'healthy', 'sictrl', 'sicontrol', 'ctr', 'wild', 'dmso']
regex = re.compile('[^a-zA-Z]')
def is_control(term):
	split_terms = term.lower().split(" ")
	for t in split_terms:
		t = regex.sub('', t)
		if t in control_terms:
			return True
	else:
		return False

In [5]:
def get_meta(search_term):
	data = query_metadata(search_term)
	limit = data['data']['geneSetTermSearch']['totalCount']
	if limit > 10000:
		data = query_metadata(search_term, limit)
	nodes = data['data']['geneSetTermSearch']['nodes']
	print("total number of %s results: %d"%(search_term, len(nodes)))
	df = pd.DataFrame(index=[], columns=['id', 'title', 'term', 'direction', 'condition_1', 'condition_2', 'is_control_1', 'is_control_2', 'status', 'search_term'])
	failed = []
	for i in nodes:
		try:
			index = len(df.index) + 1
			id = i["id"]
			title = i["title"]
			term, direction = i["term"].split(" ")
			c = term.split("-")
			cond1 = c[1]
			cond2 = c[3]
			sampleGroups = i['sampleGroups']['titles']
			cont1 = is_control(sampleGroups[cond1])
			cont2 = is_control(sampleGroups[cond2])
			status = "signature"
			if cont1 and cont2:
				status = "2 control"
			if not cont1 and not cont2:
				status = "2 pert"
			if not cont1 and cont2:
				status = "reversed"
			df.loc[index] = [
				id,
				title,
				term,
				direction,
				sampleGroups[cond1],
				sampleGroups[cond2],
				cont1,
				cont2,
				status,
				search_term
			]
		except Exception as e:
			failed.append(i)
	df.to_csv('out/%s.tsv'%search_term, sep="\t")
	valid = len(df[df.status == 'signature'].index)
	reversed = len(df[df.status == 'reversed'].index)
	two_cont = len(df[df.status == '2 control'].index)
	two_pert = len(df[df.status == '2 pert'].index)
	return valid, reversed, two_cont, two_pert

In [6]:
sig_df = pd.DataFrame(index=[], columns=["valid", "reversed", "two control", "two perturbation"])

In [7]:
sig_df.loc['knockout'] = get_meta('knockout')

total number of knockout results: 5536


In [8]:
sig_df.loc['crispr ko'] = get_meta('crispr ko')

total number of crispr ko results: 2


In [9]:
sig_df.loc['overexpression'] = get_meta('overexpression')

total number of overexpression results: 2576


In [10]:
sig_df.loc['inhibition'] = get_meta('inhibition')

total number of inhibition results: 10119


In [13]:
sig_df.loc['knockdown'] = get_meta('knockdown')

total number of knockdown results: 3728


In [11]:
sig_df

Unnamed: 0,valid,reversed,two control,two perturbation
knockout,2824,32,1037,1639
crispr ko,2,0,0,0
overexpression,1028,18,333,1197
inhibition,3207,610,2152,3947


In [12]:
df = pd.read_csv('out/knockout.tsv', sep="\t", index_col=0)
df.groupby('status').first().to_csv("out/sample_per_status.tsv", sep="\t")

In [15]:
from glob import glob

In [16]:
df = None
initialized = False
for f in glob("out/*.tsv"):
	if not initialized:
		df = pd.read_csv(f, sep="\t", index_col=0)
		initialized = True
	else:
		df = pd.concat([df, pd.read_csv(f, sep="\t", index_col=0)])

In [26]:
df = df.groupby('id').first()
df.shape

(20925, 9)

In [27]:
df = df[(df.status == 'signature') | (df.status == 'reversed')]
df.shape

(8644, 9)

In [28]:
df.head()

Unnamed: 0_level_0,title,term,direction,condition_1,condition_2,is_control_1,is_control_2,status,search_term
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0005676b-ed51-4ee2-be8a-4d91a4e6f766,Effect of arcA overexpression on murine liver ...,GSE191295-2-vs-1-mouse.tsv,dn,liver gfp strain db/db age 10wk control,liver arca strain db/db age 10wk overexpression,True,False,signature,overexpression
00074c22-6e98-4698-afc0-0752e9d88220,RNA seq of liver homogenates collected every 4...,GSE95357-2-vs-11-mouse.tsv,up,zt8 wt strain c57bl/6j sex female collection t...,zt0 ko strain c57bl/6j sex female collection t...,True,False,signature,knockout
000ffa31-e6ed-47d5-90e7-5280b5160689,Effect of TUNAR silencing and GSK3 inhibition ...,GSE99503-0-vs-3-human,up,neg sirna passages scramble dmso endoc î²h1 ce...,situnar + 1 akp passages sirna tunar sirnas en...,True,False,signature,inhibition
0012060a-be58-4451-bec8-e733bf347b1d,Genomic characteristics of RARRES1 wild type a...,GSE131161-2-vs-6-mouse,up,wild type embryo [t00 strain background c57bl/...,rarres1 ko 10 month male [t10 strain backgroun...,True,False,signature,knockout
00125985-1bd4-4958-b767-1f2385ffbdd0,Exploring the gene expression profile upon FXR...,GSE101754-5-vs-2-human,up,h358 ctrl+d3 rep cell line ctrl 3 days lung/br...,h358 sh3 d5 rep cell line fxr1 kd 5 days lung/...,True,False,signature,knockdown


In [29]:
import time

In [41]:
def get_geneset(uid):
	query = {"operationName":"ViewGeneSet","variables":{"id":uid},"query":"query ViewGeneSet($id: UUID!) {\n  geneSet(id: $id) {\n    genes {\n      nodes {\n        symbol\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n"}
	headers = {
        "Accept": "application/json",
        "Content-Type": "application/json"
    }
	for i in range(5):    
		res = requests.post(rummageo_url, data=json.dumps(query), headers=headers)
		if res.ok:
			time.sleep(0.2)
			return [i["symbol"] for i in res.json()["data"]["geneSet"]["genes"]["nodes"]]
		else:
			time.sleep(i+1)
	else:
		return False
	

In [None]:
get_geneset()

In [45]:
failed = []
with open('out/rummageo_gene_perturbations.gmt', 'w') as o:
	for uid, val in df.iterrows():
		title = val["title"]
		term = val["term"].replace(".tsv", "")
		gse, cond1, _, cond2, species = term.split("-")
		status = val["status"]
		direction = val["direction"]
		expr = cond1 + "_v_" + cond2
		if status == "reversed":
			expr = cond2 + "_v_" + cond1
			direction = "dn" if direction == 'up' else 'dn'
		label = "%s_%s_%s_%s_%s"%(gse, title, expr, species, direction)
		g = get_geneset(uid)
		if g:
			o.write("\t".join([label, "", *g]) + "\n")
		else:
			failed.append(uid)

		

In [46]:
len(failed)

0

In [47]:
res = requests.get("https://s3.dev.maayanlab.cloud/sigcom-lincs/ranker/signatures_meta.json")
signatures = res.json()

In [54]:
drugs = set(i["pert_name"] for i in signatures.values())
len(drugs)

33609

In [59]:
from tqdm import tqdm

In [None]:
df = pd.DataFrame(index=[], columns=['id', 'title', 'term', 'direction', 'condition_1', 'condition_2', 'is_control_1', 'is_control_2', 'status', 'search_term'])


In [63]:
df.shape

(8170, 10)

In [None]:
start = False
for drug in tqdm(drugs):
	if drug == 'BRD-K22420960':
		start = True
	if start:
		data = query_metadata(str(drug))
		time.sleep(0.2)
		limit = data['data']['geneSetTermSearch']['totalCount']
		if limit > 0:
			if limit > 10000:
				data = query_metadata(drug, limit)
			nodes = data['data']['geneSetTermSearch']['nodes']
			print("total number of %s results: %d"%(drug, len(nodes)))
			failed = []
			for i in nodes:
				try:
					index = len(df.index) + 1
					id = i["id"]
					title = i["title"]
					term, direction = i["term"].split(" ")
					c = term.split("-")
					cond1 = c[1]
					cond2 = c[3]
					sampleGroups = i['sampleGroups']['titles']
					cont1 = is_control(sampleGroups[cond1])
					cont2 = is_control(sampleGroups[cond2])
					status = "signature"
					if cont1 and cont2:
						status = "2 control"
					if not cont1 and not cont2:
						status = "2 pert"
					if not cont1 and cont2:
						status = "reversed"
					df.loc[index] = [
						id,
						title,
						term,
						direction,
						sampleGroups[cond1],
						sampleGroups[cond2],
						cont1,
						cont2,
						status,
						drug
					]
				except Exception as e:
					failed.append(i)
		

In [66]:
df.shape

(74703, 10)

In [67]:
df.to_csv('out/drugs.tsv')

In [71]:
df = df[(df.status == 'signature') | (df.status == 'reversed')]

In [72]:
df.shape

(24241, 10)

In [74]:
df.tail()

Unnamed: 0,id,title,term,direction,condition_1,condition_2,is_control_1,is_control_2,status,search_term
74697,5f1676af-4a7f-4131-bf1e-447fb8d104a3,Effect of ezetimibe treatment on gene expressi...,GSE225982-1-vs-0-human,up,primary lung fibroblast negative control biol ...,primary lung fibroblast ezetimibe treated biol...,True,False,signature,ezetimibe
74700,38f77bcb-9632-4a9a-bc43-3c3dc0afcbea,Effect of donepezil on lung fibrosis related p...,GSE211224-2-vs-1-mouse.tsv,dn,lung control,lung bleomycin,True,False,signature,donepezil
74701,a1805a1a-4eaa-4f07-a3a7-d3a213fdacbe,Effect of donepezil on lung fibrosis related p...,GSE211224-2-vs-1-mouse.tsv,up,lung control,lung bleomycin,True,False,signature,donepezil
74702,e7985a7e-196c-4313-bedb-4f31cf536287,Effect of donepezil on lung fibrosis related p...,GSE211224-2-vs-0-mouse.tsv,up,lung control,lung donepezil,True,False,signature,donepezil
74703,f4437d2c-1363-4051-a321-8733d98da298,Effect of donepezil on lung fibrosis related p...,GSE211224-2-vs-0-mouse.tsv,dn,lung control,lung donepezil,True,False,signature,donepezil


In [76]:
df.loc[74697, 'title']

'Effect of ezetimibe treatment on gene expression of TGFβ1-activated primary human lung fibroblasts'

In [79]:
df.search_term.nunique()

337

In [82]:
df.groupby('search_term')["id"].count().to_csv("drug_count.tsv", sep="\t")

In [83]:
drugs = pd.read_csv('drug_count.tsv', sep="\t", index_col=0)
drugs.head()

Unnamed: 0_level_0,id
search_term,Unnamed: 1_level_1
1B,566
3-deazaadenosine,6
A-366,2
A-66,13
A-7,4


In [84]:
drugs.id.sum()

np.int64(22595)

In [91]:
df = df.groupby('id').first()
df.shape

(21550, 9)

In [93]:
failed = []
with open('out/rummageo_chemical_perturbations.gmt', 'w') as o:
	for uid, val in tqdm(df.iterrows()):
		search_term = val["search_term"]
		if search_term in drugs.index:
			title = val["title"]
			term = val["term"].replace(".tsv", "")
			gse, cond1, _, cond2, species = term.split("-")
			status = val["status"]
			
			direction = val["direction"]
			expr = cond1 + "_v_" + cond2
			if status == "reversed":
				expr = cond2 + "_v_" + cond1
				direction = "dn" if direction == 'up' else 'dn'
			label = "%s_%s_%s_%s_%s_%s"%(gse, title, expr, search_term, species, direction)
			g = get_geneset(uid)
			if g:
				o.write("\t".join([label, "", *g]) + "\n")
			else:
				failed.append(uid)

			

21550it [1:58:20,  3.04it/s]


In [99]:
gmt = {}
counter = 0
with open('out/rummageo_chemical_perturbations.gmt') as o:
	for line in o:
		counter +=1 
		label, _, *genes = line.strip().split("\t")
		gmt[label] = genes
counter

20107

In [100]:
len(gmt)

19451

In [2]:
removal = pd.read_csv('drug_count-marked_for_removal.txt', sep="\t", index_col=0)
removal.head()

Unnamed: 0_level_0,id,include
search_term,Unnamed: 1_level_1,Unnamed: 2_level_1
1B,566,x
3-deazaadenosine,6,
A-366,2,
A-66,13,
A-7,4,


In [3]:
remove = removal[removal.include == 'x'].index


In [5]:
removal[removal.include == 'x']

Unnamed: 0_level_0,id,include
search_term,Unnamed: 1_level_1,Unnamed: 2_level_1
1B,566,x
ATPA,86,x
AVA,391,x
C-1,261,x
CDC,209,x
FIT,266,x
ITE,2286,x
PIT,3904,x
RITA,92,x
TRIM,452,x


In [104]:
failed = []
with open('out/rummageo_chemical_perturbations2.gmt', 'w') as o:
	for uid, val in tqdm(df.iterrows()):
		search_term = val["search_term"]
		if search_term in drugs.index and search_term not in remove:
			title = val["title"]
			term = val["term"].replace(".tsv", "")
			gse, cond1, _, cond2, species = term.split("-")
			status = val["status"]
			
			direction = val["direction"]
			expr = cond1 + "_v_" + cond2
			if status == "reversed":
				expr = cond2 + "_v_" + cond1
				direction = "dn" if direction == 'up' else 'dn'
			label = "%s_%s_%s_%s_%s_%s"%(gse, title, expr, search_term, species, direction)
			g = gmt[label]
			if g:
				o.write("\t".join([label, "", *g]) + "\n")
			else:
				failed.append(uid)

			

21550it [00:06, 3554.39it/s]


In [247]:
import obonet
from drug_named_entity_recognition import find_drugs
from maayanlab_bioinformatics.harmonization.ncbi_genes import ncbi_genes_lookup

In [248]:
url = 'https://ftp.expasy.org/databases/cellosaurus/cellosaurus.obo'
graph = obonet.read_obo(url)
cell_line_mapper = {}
for k,v in graph.nodes(data=True):
	name = v["name"]
	cell_line_mapper[name.lower()] = name
	if "synonym" in v:
		for syn in v["synonym"]:
			s = syn.split('"')[1].lower()
			cell_line_mapper[s] = name

def cell_line_verifier(name):
	return cell_line_mapper.get(name.lower())



In [249]:
url = "https://purl.obolibrary.org/obo/cl/cl-basic.obo"
graph = obonet.read_obo(url)
cell_type_mapper = {}
for k,v in graph.nodes(data=True):
	name = v["name"]
	if "subset" in v:
		if 'human_reference_atlas' in v["subset"]:
			cell_type_mapper[name.lower()] = name
			if "synonym" in v:
				for syn in v["synonym"]:
					s = syn.split('"')[1].lower()
					cell_type_mapper[s] = name

def cell_type_verifier(name):
	return cell_type_mapper.get(name.lower())


In [250]:
url = "https://purl.obolibrary.org/obo/uberon/uberon-basic.obo"
graph = obonet.read_obo(url)
subsets = set()

tissue_mapper = {}
for k,v in graph.nodes(data=True):
	if "subset" in v and ('major_organ' in v["subset"] or 'organ_slim' in v["subset"]):
		if "name" in v:
			name = v["name"]
			tissue_mapper[name.lower()] = name
			if "synonym" in v:
				for syn in v["synonym"]:
					s = syn.split('"')[1].lower()
					tissue_mapper[s] = name
			if "subset" in v:
				subsets = subsets.union(v["subset"])

def tissue_verifier(name):
	return tissue_mapper.get(name.lower())

In [251]:
url = "http://purl.obolibrary.org/obo/mondo.obo"
graph = obonet.read_obo(url)

disease_mapper = {}
for k,v in graph.nodes(data=True):
	if "name" in v:
		name = v["name"]
		disease_mapper[name.lower()] = name
		if "synonym" in v:
			for syn in v["synonym"]:
				s = syn.split('"')[1].lower()
				disease_mapper[s] = name

def disease_verifier(name):
	return disease_mapper.get(name.lower())

In [252]:
def drug_verifier(name):
	results = find_drugs([name])
	if results:
		return results[0][0]['name']
	else: return None


In [253]:
human_genes = ncbi_genes_lookup()
mouse_genes = ncbi_genes_lookup(organism='Mammalia/Mus_musculus')

In [254]:
def gene_verifier(name):
	return human_genes(name) or mouse_genes(name)

In [255]:
from inflector import Inflector
inflector = Inflector()

In [306]:
def named_entity_recognition(label):
	data = {
		"cell_line": [],
		"cell_type": [],
		"tissue": [],
		"gene": [],
		"drug": [],
		"disease": [],
	}
	for size in [3, 2, 1]:
		tokens = label.split(" ")
		for start in range(len(tokens)):
			end = start + size
			if end > len(tokens):
				break
			i = " ".join(tokens[start: end])
			i = i.replace("(","").replace(")","").replace(".","").replace(",","")
			if i.lower() not in ['the', 'a', 'is', 'are', 'in', 'on', 'of', 'at', 'for', 'or', 'to', 'via', 'mice', "gene expression", "of", "an", "cancer", "as", "vs", "wt", "ko", "age", "secretase", "not", "set", "ii", "hypoxia-induced", "po", 'dna', 't', 'beige', 'beta', 'utr', 'b', 'or', 'de']:
				singular = inflector.singularize(i)
				if not i.isnumeric() and i.lower() not in ['one', 'mouse', 'cl']:
					r = cell_line_verifier(i)
					if r:
						data['cell_line'].append(r)
						label = label.replace(i, "")
						continue
				r = cell_type_verifier(singular)
				if r:
					data['cell_type'].append(r)
					label = label.replace(i, "")
					continue
				r = tissue_verifier(singular)
				if r:
					data['tissue'].append(r)
					label = label.replace(i, "")
					continue
				r = drug_verifier(i)
				if r:
					data['drug'].append(r)
					label = label.replace(i, "")
					continue
				if not i.isnumeric() and not i.lower() in ['fat', 'homodimer', 'embryonic', 'vs', 'w', 'cl', 'brown']:
					r = gene_verifier(i.replace("(","").replace(")",""))
					if r:
						data['gene'].append(i.replace("(","").replace(")",""))
						label = label.replace(i, "")
						continue
				r = disease_verifier(i)
				if r:
					data['disease'].append(r)
					label = label.replace(i, "")
					continue
	return data
			

In [307]:
named_entity_recognition("RNA-Sequencing analysis of liver tissue from female Foxa3-Cre YAP1 knockout mice, Foxa3-Cre YAP1 KO TAZ heterozygous mice, and WT littermate controls, at 3-4 months of age")


{'cell_line': [],
 'cell_type': [],
 'tissue': ['liver'],
 'gene': ['YAP1', 'YAP1'],
 'drug': [],
 'disease': ['tissue']}

In [308]:
for term in 'Inhibition of UBA52 induces autophagy via EMC6 to suppress hepatocellular carcinoma tumorigenesis and progression'.split():
	term = term.replace('(','').replace(')','')
	print(term, cell_type_verifier(term))


Inhibition None
of None
UBA52 None
induces None
autophagy None
via None
EMC6 None
to None
suppress None
hepatocellular None
carcinoma None
tumorigenesis None
and None
progression None


In [309]:
cell_line_verifier('WT')

'STA-WT-1'

In [310]:
from glob import glob

In [311]:
df = None
initialized = False
for f in glob("out/*.tsv"):
	if not f == "out/drugs.tsv":
		if not initialized:
			df = pd.read_csv(f, sep="\t", index_col=0)
			initialized = True
		else:
			df = pd.concat([df, pd.read_csv(f, sep="\t", index_col=0)])

In [312]:
df.head()

Unnamed: 0,id,title,term,direction,condition_1,condition_2,is_control_1,is_control_2,status,search_term
1,00025b3e-0688-4768-a8d9-7c4c7f8fbc91,Hepatocyte-specific knockout of Nicotinamide p...,GSE144443-10-vs-0-mouse.tsv,dn,wt 3d full liver [022 strain background c57bl/...,wt 21d full liver [022 strain background c57bl...,True,True,2 control,knockout
2,00035486-42c5-4bdf-8c05-fd1d6c5ff94e,RNA sequencing of ADIPOR2 knockout HEK293 cell...,GSE158834-10-vs-7-human,dn,wt pa 9h cell embryonic kidney line hek293t 1%...,wt bsa 24h cell embryonic kidney line hek293t ...,True,True,2 control,knockout
3,00065324-a3a9-4f9b-b668-7eda311336aa,mSeq of vtRNA1.1 and vtRNA1.3 knockout or cont...,GSE147054-6-vs-1-human,up,vtrna1.3ko control condition cell line hela vt...,vtrna1.1ko control condition cell line hela vt...,True,True,2 control,knockout
4,00074c22-6e98-4698-afc0-0752e9d88220,RNA seq of liver homogenates collected every 4...,GSE95357-2-vs-11-mouse.tsv,up,zt8 wt strain c57bl/6j sex female collection t...,zt0 ko strain c57bl/6j sex female collection t...,True,False,signature,knockout
5,0012060a-be58-4451-bec8-e733bf347b1d,Genomic characteristics of RARRES1 wild type a...,GSE131161-2-vs-6-mouse,up,wild type embryo [t00 strain background c57bl/...,rarres1 ko 10 month male [t10 strain backgroun...,True,False,signature,knockout


In [313]:
df = df.groupby('id').first()
df = df[(df.status == 'signature') | (df.status == 'reversed')]
df.shape

(8644, 9)

In [314]:
gmt = {}
with open('out/rummageo_gene_perturbations.gmt') as o:
	for line in o:
		label, _, *genes = line.strip().split("\t")
		gmt[label] = genes
len(gmt)

8315

In [315]:
gmt.keys()

dict_keys(['GSE191295_Effect of arcA overexpression on murine liver gene expression profiles_2_v_1_mouse_dn', 'GSE95357_RNA seq of liver homogenates collected every 4 hours for one 24 hour cycle from adipocyte knockouts for Bmal1_2_v_11_mouse_up', 'GSE99503_Effect of TUNAR silencing and GSK3 inhibition on human b-cell transcriptome_0_v_3_human_up', 'GSE131161_Genomic characteristics of RARRES1 wild type and knockout mice lung tissues_2_v_6_mouse_up', 'GSE101754_Exploring the gene expression profile upon FXR1 knockdown in H358 cells using RNA-seq_5_v_2_human_up', 'GSE158834_RNA sequencing of ADIPOR2 knockout HEK293 cells undergone various treatments with palmitic acid_2_v_5_human_up', 'GSE196505,GSE196507_RNA-seq analysis to examine the effect of KDM2B knockdown or overexpression on transcriptional responses in ECs during VEGF-signaling_3_v_2_human_up', 'GSE227535_Liver-Specific Overexpression of HKDC1 Increases Hepatocyte Size and Proliferative Capacity_0_v_2_mouse_up', 'GSE230865_Poly

In [316]:
failed = []
mapped_terms_gene = {}
with open('out/rummageo_gene_perturbations_shortened.gmt', 'w') as o:
	for uid, val in df.iterrows():
		title = val["title"]
		term = val["term"].replace(".tsv", "")
		gse, cond1, _, cond2, species = term.split("-")
		status = val["status"]
		direction = val["direction"]
		expr = cond1 + "_v_" + cond2
		search_term = val['search_term']
		if status == "reversed":
			expr = cond2 + "_v_" + cond1
			direction = "dn" if direction == 'up' else 'dn'
		old_label = "%s_%s_%s_%s_%s"%(gse, title, expr, species, direction)
		if old_label in gmt:
			g = gmt[old_label]
			r = named_entity_recognition(old_label)
			label = gse.replace(",", "_")
			if r["gene"]:
				label = label + "_" + "_".join(set(r["gene"])) + " " + search_term
				if r["cell_line"]:
					label = label + "_" + "_".join(set(r["cell_line"]))
				elif r["cell_type"]:
					label = label + "_" + "_".join(set(r["cell_type"]))
				elif r["tissue"]:
					label = label + "_" + "_".join(set(r["tissue"]))
				if r["drug"]:
					label = label + "_" + "_".join(set(r["drug"]))
				label += "_%s_%s_%s"%(expr, species, direction)
				mapped_terms_gene[label.replace("/", "-")] = old_label
				if g:
					o.write("\t".join([label.replace("/", "-").replace(",", " "), "", *g]) + "\n")
				else:
					failed.append(uid)

				

In [317]:
df = pd.read_csv('out/drugs.tsv')


In [318]:
gmt = {}
with open('out/rummageo_chemical_perturbations.gmt') as o:
	for line in o:
		label, _, *genes = line.strip().split("\t")
		gmt[label] = genes
len(gmt)

19451

In [319]:
label

'GSE189579_Next Generation Sequencing Facilitates Quantitative Analysis of  ribo-tag pulled down Transcriptomes in renal proximal tubule epithelial cells from Plasmodium Chabaudi Chabaudi infected Egfp.l10Pepck and Egfp.l10PepckSlc40a1Pepckdelta/delta mice_3_v_0_PIT_mouse_dn'

In [320]:
df = df.groupby('id').first()
df = df[(df.status == 'signature') | (df.status == 'reversed')]
df.shape

(21472, 10)

In [None]:
failed = []
mapped_terms = {}
with open('out/rummageo_chemical_perturbations_shortened.gmt', 'w') as o:
	for uid, val in df.iterrows():
		title = val["title"]
		term = val["term"].replace(".tsv", "")
		gse, cond1, _, cond2, species = term.split("-")
		status = val["status"]
		direction = val["direction"]
		expr = cond1 + "_v_" + cond2
		search_term = val['search_term']
		if search_term not in remove:
			if status == "reversed":
				expr = cond2 + "_v_" + cond1
				direction = "dn" if direction == 'up' else 'dn'
			old_label = "%s_%s_%s_%s_%s_%s"%(gse, title, expr, search_term, species, direction)
			if old_label in gmt:
				g = gmt[old_label]
				r = named_entity_recognition(old_label)
				label = gse.replace(",","_")
				# if r["drug"]:
				# 	label = label + "_" + ",".join(set([search_term.lower()] + [i.lower() for i in r["drug"]]))
				# else:
				label = label + "_" + search_term 
				if r["cell_line"]:
					label = label + "_" + ",".join(set(r["cell_line"]))
				elif r["cell_type"]:
					label = label + "_" + ",".join(set(r["cell_type"]))
				elif r["tissue"]:
					label = label + "_" + ",".join(set(r["tissue"]))
				# if r["disease"]:
				# 	label = label + "_" + ",".join(set(r["disease"]))
				# if r["gene"]:
				# 	label = label + "_" + ",".join(set(r["gene"]))
				label += "_%s_%s_%s"%(expr, species, direction)
				mapped_terms[label] = old_label
				for_print = False
				for key,val in r.items():
					if not key in ['disease', 'drug']:
						if len(set(val)) > 1:
							print(key, set(val))
							for_print = True
				if for_print:
					print(old_label)
			
				if g:
					o.write("\t".join([label.replace("/", "-").replace(",", " "), "", *g]) + "\n")
				else:
					failed.append(uid)

				

In [322]:
cell_line_verifier('K14 Cre')