# RummaGEO Drug Perturbations Processing Script
This notebook contains the script used to process the RummaGEO Drug Perturbation Signatures dataset for Harmonizome. The script is adapted from the [script](https://github.com/MaayanLab/EnrichrPythonScripts/blob/master/Enrichr/rummageo_pert_sigs.ipynb) used to process the RummaGEO perturbation signatures for Enrichr created by Erol Evangelista.

The signatures are created by querying the RummaGEO metadata for signatures or reversed signatures where one condition matches a dictionary of control terms and the other condition contains a drug from a list of drugs from SigCom LINCS.

The script to gather drug perturbation signatures was run on 1/16/25.

In [None]:
import pandas as pd
import datetime
from glob import glob
import math
import matplotlib.pyplot as plt
import numpy as np
import os
import re
import requests
import time
import scipy.spatial.distance as dist
import seaborn as sns
import sys
import json
import scanpy as sc
from tqdm import tqdm

# UMAP
from sklearn.feature_extraction.text import TfidfVectorizer
import anndata
from collections import OrderedDict

# Bokeh
from bokeh.io import output_notebook
from bokeh.plotting import figure, show, save, output_file
from bokeh.models import HoverTool, ColumnDataSource
from bokeh.palettes import Category20
output_notebook()

from IPython.display import display, HTML, Markdown
sys.setrecursionlimit(100000)

## Get Drug Perturbation Data from RummaGEO

In [None]:
# GraphQL endpoint that every RummaGEO query in this notebook is POSTed to.
rummageo_url = "https://rummageo.com/graphql"

In [None]:
def query_metadata(term, limit=10000):
    """Run the RummaGEO ``TermSearch`` GraphQL query and return the decoded JSON.

    Args:
        term: Value sent as the ``terms`` query variable.
        limit: Value sent as the ``first`` variable (maximum number of nodes
            requested); the offset is always 0.

    Returns:
        The parsed JSON body of the response.
    """
    graphql_query = "query TermSearch($terms: [String]! = [\"neuron\"], $offset: Int = 0, $first: Int = 10) {\n  geneSetTermSearch(terms: $terms, offset: $offset, first: $first) {\n    nodes {\n      id\n      term\n      gse\n      platform\n      pmid\n      publishedDate\n      sampleGroups\n      title\n      geneSetById {\n        nGeneIds\n        species\n        __typename\n      }\n      __typename\n    }\n    totalCount\n    __typename\n  }\n}\n"
    payload = {
        "operationName": "TermSearch",
        "variables": {"terms": term, "offset": 0, "first": limit},
        "query": graphql_query,
    }
    request_headers = {
        "Accept": "application/json",
        "Content-Type": "application/json"
    }
    response = requests.post(
        rummageo_url,
        data=json.dumps(payload),
        headers=request_headers,
    )
    return response.json()

In [None]:
# Lower-case words that mark a sample group as a control condition.
control_terms = ['wt', 'wildtype', 'control', 'cntrl', 'ctrl', 'uninfected', 'normal', 'untreated', 'unstimulated', 'shctrl', 'ctl', 'healthy', 'sictrl', 'sicontrol', 'ctr', 'wild', 'dmso']
# Matches every character that is not an ASCII letter.
regex = re.compile('[^a-zA-Z]')
def is_control(term):
    """Return True if any space-separated word of *term* is a control term.

    Each word is lower-cased and stripped of all non-letter characters before
    it is compared against ``control_terms``.
    """
    words = term.lower().split(" ")
    return any(regex.sub('', word) in control_terms for word in words)

In [None]:
def get_meta(search_term):
    """Query RummaGEO for *search_term*, classify every hit and save a TSV.

    Each returned node's term is split into its two condition keys; the
    corresponding sample-group titles are tested with ``is_control`` and the
    pair is classified as:

    * ``signature`` - condition 1 is a control, condition 2 is not
    * ``reversed``  - condition 2 is a control, condition 1 is not
    * ``2 control`` - both conditions look like controls
    * ``2 pert``    - neither condition looks like a control

    The table is written to ``out/{search_term}.tsv`` (index starting at 1).

    Args:
        search_term: Term passed to ``query_metadata``.

    Returns:
        Tuple ``(signature, reversed, 2 control, 2 pert)`` of row counts.
    """
    columns = ['id', 'title', 'term', 'direction', 'condition_1', 'condition_2', 'is_control_1', 'is_control_2', 'status', 'search_term']
    data = query_metadata(search_term)
    total = data['data']['geneSetTermSearch']['totalCount']
    if total > 10000:
        # The default page size truncated the result; ask for everything.
        data = query_metadata(search_term, total)
    nodes = data['data']['geneSetTermSearch']['nodes']
    print(f'total number of {search_term} results: {len(nodes)}')

    rows = []
    failed = []
    for node in nodes:
        try:
            node_id = node["id"]
            title = node["title"]
            term, direction = node["term"].split(" ")
            parts = term.split("-")
            titles = node['sampleGroups']['titles']
            group_1 = titles[parts[1]]
            group_2 = titles[parts[3]]
            cont1 = is_control(group_1)
            cont2 = is_control(group_2)
        except Exception:
            # Best effort: nodes with an unexpected term layout or missing
            # sample groups are skipped, but counted and reported below.
            failed.append(node)
            continue
        if cont1 and cont2:
            status = "2 control"
        elif cont1:
            status = "signature"
        elif cont2:
            status = "reversed"
        else:
            status = "2 pert"
        rows.append([node_id, title, term, direction, group_1, group_2, cont1, cont2, status, search_term])

    if failed:
        print(f'{len(failed)} {search_term} results could not be parsed')

    # Build the frame once instead of growing it row by row.
    df = pd.DataFrame(rows, columns=columns, index=pd.RangeIndex(1, len(rows) + 1))
    df.to_csv(f'out/{search_term}.tsv', sep="\t")

    counts = df['status'].value_counts()
    return tuple(int(counts.get(name, 0)) for name in ('signature', 'reversed', '2 control', '2 pert'))

In [None]:
# Download the SigCom LINCS signature metadata and decode the JSON body.
res = requests.get("https://s3.dev.maayanlab.cloud/sigcom-lincs/ranker/signatures_meta.json")
signatures = res.json()

In [None]:
# Unique perturbagen names across all SigCom LINCS signatures.
drugs = {signature["pert_name"] for signature in signatures.values()}
len(drugs)

In [None]:
# Position of the resume marker used by the query loop below. Set iteration
# order of strings can differ between interpreter sessions, so this position
# is only meaningful within the current session.
list(drugs).index('BRD-K22420960')

In [None]:
# Disabled code kept as a string literal: it would reload a previously saved
# drug table from out/drugs.tsv (presumably to resume an interrupted run).
'''df = pd.read_csv('out/drugs.tsv', sep='\t', index_col=0)
df'''

In [None]:
# Empty table that the drug query loop fills with one row per RummaGEO hit.
df = pd.DataFrame(index=[], columns=['id', 'title', 'term', 'direction', 'condition_1', 'condition_2', 'is_control_1', 'is_control_2', 'status', 'search_term'])

In [None]:
# Show the current dimensions of df.
df.shape

In [None]:
# Query RummaGEO for every drug, starting at the 'BRD-K22420960' marker, and
# append one classified row per hit to df.
#
# Rows are collected in a list and appended to df in one step (growing a
# DataFrame with df.loc per row is quadratic). The try/finally keeps whatever
# was collected if the loop is interrupted, so progress is not lost.
start = False
rows = []      # parsed hits across all drugs
failed = []    # nodes that could not be parsed, across all drugs
try:
    for drug in tqdm(drugs):
        if drug == 'BRD-K22420960':
            start = True
        if not start:
            continue
        drug_name = str(drug)
        data = query_metadata(drug_name)
        time.sleep(0.2)  # be gentle with the API
        limit = data['data']['geneSetTermSearch']['totalCount']
        if limit <= 0:
            continue
        if limit > 10000:
            # Default page size truncated the result; ask for everything.
            data = query_metadata(drug_name, limit)
        nodes = data['data']['geneSetTermSearch']['nodes']
        print("total number of %s results: %d"%(drug, len(nodes)))
        for node in nodes:
            try:
                pertid = node["id"]
                title = node["title"]
                term, direction = node["term"].split(" ")
                c = term.split("-")
                sampleGroups = node['sampleGroups']['titles']
                group_1 = sampleGroups[c[1]]
                group_2 = sampleGroups[c[3]]
                cont1 = is_control(group_1)
                cont2 = is_control(group_2)
            except Exception:
                # Best effort: skip hits with an unexpected layout, but keep
                # them so they can be inspected afterwards.
                failed.append(node)
                continue
            if cont1 and cont2:
                status = "2 control"
            elif cont1:
                status = "signature"
            elif cont2:
                status = "reversed"
            else:
                status = "2 pert"
            rows.append([pertid, title, term, direction, group_1, group_2, cont1, cont2, status, drug])
finally:
    if rows:
        first_index = len(df.index) + 1
        new_rows = pd.DataFrame(
            rows,
            columns=df.columns,
            index=range(first_index, first_index + len(rows)),
        )
        df = new_rows if df.empty else pd.concat([df, new_rows])
    if failed:
        print("%d results could not be parsed" % len(failed))

In [None]:
# Remove exact duplicate rows and renumber the index starting at 1.
df = df.drop_duplicates().reset_index(drop=True)
df.index += 1
df

In [None]:
# Persist the full hit table as a gzip-compressed TSV.
df.to_csv('out/drugs.tsv.gz', compression='gzip', sep='\t')

In [None]:
# Reload the saved hit table; the first column is used as the index.
df = pd.read_csv('out/drugs.tsv.gz', sep='\t', compression='gzip', index_col=0)
df

In [None]:
# Keep only control-vs-perturbation pairs (in either orientation).
df = df[df.status.isin(['signature', 'reversed'])]

In [None]:
# Show the dimensions of df after the status filter.
df.shape

In [None]:
# Number of distinct search terms that still have at least one row.
df.search_term.nunique()

In [None]:
# Write the number of non-null ids per search term to drug_count.tsv.
df.groupby('search_term')["id"].count().to_csv("drug_count.tsv", sep="\t")

In [None]:
# NOTE: this rebinds `drugs` from the set of drug names to a DataFrame indexed
# by search term with a single `id` count column.
drugs = pd.read_csv('drug_count.tsv', sep="\t", index_col=0)
drugs.head()

In [None]:
# Total of the per-search-term counts.
drugs.id.sum()

In [None]:
# Collapse to one row per gene-set id (first non-null value of each column);
# the id becomes the index of df from here on.
df = df.groupby('id').first()
df.shape

In [None]:
def get_geneset(uid):
    """Fetch the gene symbols of one RummaGEO gene set.

    The ``ViewGeneSet`` GraphQL query is attempted up to five times. A
    network error, a non-OK status or a payload without the expected
    structure counts as a failed attempt and is followed by a growing pause
    (1s, 2s, ...).

    Args:
        uid: Gene set UUID sent as the ``id`` query variable.

    Returns:
        List of gene symbols on success, ``False`` if every attempt failed.
    """
    query = {"operationName":"ViewGeneSet","variables":{"id":uid},"query":"query ViewGeneSet($id: UUID!) {\n  geneSet(id: $id) {\n    genes {\n      nodes {\n        symbol\n        __typename\n      }\n      __typename\n    }\n    __typename\n  }\n}\n"}
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json"
    }
    for attempt in range(5):
        try:
            # A timeout keeps one stalled connection from hanging the run.
            res = requests.post(rummageo_url, data=json.dumps(query), headers=headers, timeout=60)
        except requests.RequestException:
            res = None
        if res is not None and res.ok:
            time.sleep(0.2)  # throttle successful calls
            try:
                return [node["symbol"] for node in res.json()["data"]["geneSet"]["genes"]["nodes"]]
            except (KeyError, TypeError, ValueError):
                # Undecodable body or null/missing fields: treat as a failed
                # attempt and retry.
                pass
        time.sleep(attempt + 1)
    return False
	

In [None]:
# Download the gene set of every retained signature and write it to a GMT
# file, one line per signature: label, empty description, genes.
#
# For "reversed" signatures (condition 1 is the perturbation) the two
# conditions are swapped in the label and the direction is flipped
# (up <-> dn) so every label reads control_v_perturbation.
failed = []
written = set()  # labels already written in this run; avoids duplicate lines
with open('out/rummageo_chemical_perturbations.gmt', 'w') as o:
    for uid, val in tqdm(df.iterrows(), total=len(df)):
        search_term = val["search_term"]
        if search_term not in drugs.index:
            continue
        title = val["title"]
        term = val["term"].replace(".tsv", "")
        gse, cond1, _, cond2, species = term.split("-")
        status = val["status"]

        direction = val["direction"]
        expr = cond1 + "_v_" + cond2
        if status == "reversed":
            expr = cond2 + "_v_" + cond1
            direction = "dn" if direction == 'up' else 'up'
        label = "%s_%s_%s_%s_%s_%s"%(gse, title, expr, search_term, species, direction)
        if label in written:
            continue
        g = get_geneset(uid)
        if g:
            o.write("\t".join([label, "", *g]) + "\n")
            written.add(label)
        else:
            # Download failed (or the set is empty); keep the id for review.
            failed.append(uid)

			

In [None]:
# Load the downloaded GMT into memory as {label: [genes]} and count its lines.
gmt = {}
counter = 0
with open('out/rummageo_chemical_perturbations.gmt') as o:
    for counter, line in enumerate(o, start=1):
        fields = line.strip().split("\t")
        # Second field is the (empty) description column.
        label, _, *genes = fields
        gmt[label] = genes
counter

In [None]:
# Number of distinct labels; smaller than the line count if labels repeat.
len(gmt)

In [None]:
# Search terms excluded from the filtered and shortened GMT files below
# (presumably names too ambiguous to be reliable drug matches - confirm).
remove = {'1B', 'ATPA', 'AVA', 'C-1', 'CDC', 'FIT', 'ITE', 'PIT', 'RITA', 'TRIM', 'compe', 'iq', 'niacin', 'pen', 'rutin'}
remove

In [None]:
# Write a second GMT that leaves out the search terms in `remove`, reusing
# the gene sets already loaded in `gmt` instead of downloading them again.
#
# Labels are rebuilt exactly as when the GMT was written: for "reversed"
# signatures the conditions are swapped and the direction is flipped
# (up <-> dn).
failed = []
with open('out/rummageo_chemical_perturbations2.gmt', 'w') as o:
    for uid, val in tqdm(df.iterrows(), total=len(df)):
        search_term = val["search_term"]
        if search_term not in drugs.index or search_term in remove:
            continue
        title = val["title"]
        term = val["term"].replace(".tsv", "")
        gse, cond1, _, cond2, species = term.split("-")
        status = val["status"]

        direction = val["direction"]
        expr = cond1 + "_v_" + cond2
        if status == "reversed":
            expr = cond2 + "_v_" + cond1
            direction = "dn" if direction == 'up' else 'up'
        label = "%s_%s_%s_%s_%s_%s"%(gse, title, expr, search_term, species, direction)
        # .get: a signature whose download failed has no entry in gmt and is
        # recorded as failed instead of raising KeyError.
        g = gmt.get(label)
        if g:
            o.write("\t".join([label, "", *g]) + "\n")
        else:
            failed.append(uid)

			

In [None]:
import obonet
from drug_named_entity_recognition import find_drugs
from maayanlab_bioinformatics.harmonization.ncbi_genes import ncbi_genes_lookup

In [None]:
# Build a lower-cased name/synonym -> canonical name lookup from Cellosaurus.
url = 'https://ftp.expasy.org/databases/cellosaurus/cellosaurus.obo'
graph = obonet.read_obo(url)
cell_line_mapper = {}
for k, v in graph.nodes(data=True):
    # Skip nodes without a name attribute, as the tissue and disease loaders
    # do, instead of failing with KeyError.
    name = v.get("name")
    if name is None:
        continue
    cell_line_mapper[name.lower()] = name
    for syn in v.get("synonym", ()):
        # The synonym text is the part between the first pair of double quotes.
        s = syn.split('"')[1].lower()
        cell_line_mapper[s] = name

def cell_line_verifier(name):
    """Return the canonical cell line name for *name* (case-insensitive), or None."""
    return cell_line_mapper.get(name.lower())

In [None]:
# Build a lower-cased name/synonym -> canonical name lookup from the Cell
# Ontology, restricted to terms in the 'human_reference_atlas' subset.
url = "https://purl.obolibrary.org/obo/cl/cl-basic.obo"
graph = obonet.read_obo(url)
cell_type_mapper = {}
for k, v in graph.nodes(data=True):
    if 'human_reference_atlas' not in v.get("subset", ()):
        continue
    # Skip nodes without a name attribute instead of failing with KeyError.
    name = v.get("name")
    if name is None:
        continue
    cell_type_mapper[name.lower()] = name
    for syn in v.get("synonym", ()):
        # The synonym text is the part between the first pair of double quotes.
        s = syn.split('"')[1].lower()
        cell_type_mapper[s] = name

def cell_type_verifier(name):
    """Return the canonical cell type name for *name* (case-insensitive), or None."""
    return cell_type_mapper.get(name.lower())


In [None]:
# Build a lower-cased name/synonym -> canonical name lookup from Uberon,
# restricted to terms tagged 'major_organ' or 'organ_slim'. `subsets` collects
# every subset tag seen on the accepted terms.
url = "https://purl.obolibrary.org/obo/uberon/uberon-basic.obo"
graph = obonet.read_obo(url)
subsets = set()

tissue_mapper = {}
for k, v in graph.nodes(data=True):
    if "subset" not in v:
        continue
    if not ('major_organ' in v["subset"] or 'organ_slim' in v["subset"]):
        continue
    if "name" not in v:
        continue
    name = v["name"]
    tissue_mapper[name.lower()] = name
    if "synonym" in v:
        for syn in v["synonym"]:
            # The synonym text is the part between the first pair of double quotes.
            s = syn.split('"')[1].lower()
            tissue_mapper[s] = name
    subsets = subsets.union(v["subset"])

def tissue_verifier(name):
    """Return the canonical tissue name for *name* (case-insensitive), or None."""
    key = name.lower()
    return tissue_mapper.get(key)

In [None]:
# Build a lower-cased name/synonym -> canonical name lookup from Mondo.
url = "http://purl.obolibrary.org/obo/mondo.obo"
graph = obonet.read_obo(url)

disease_mapper = {}
for k, v in graph.nodes(data=True):
    if "name" not in v:
        continue
    name = v["name"]
    disease_mapper[name.lower()] = name
    if "synonym" in v:
        for syn in v["synonym"]:
            # The synonym text is the part between the first pair of double quotes.
            s = syn.split('"')[1].lower()
            disease_mapper[s] = name

def disease_verifier(name):
    """Return the canonical disease name for *name* (case-insensitive), or None."""
    key = name.lower()
    return disease_mapper.get(key)

In [None]:
def drug_verifier(name):
    """Return the name of the first drug ``find_drugs`` reports for *name*, else None."""
    matches = find_drugs([name])
    if not matches:
        return None
    first_match = matches[0]
    return first_match[0]['name']

In [None]:
# Symbol lookups for human (library default) and mouse genes.
human_genes = ncbi_genes_lookup()
mouse_genes = ncbi_genes_lookup(organism='Mammalia/Mus_musculus')

def gene_verifier(name):
    """Look *name* up as a human gene first, then as a mouse gene.

    Returns the first truthy lookup result; otherwise whatever the mouse
    lookup returned.
    """
    human_hit = human_genes(name)
    if human_hit:
        return human_hit
    return mouse_genes(name)

In [None]:
from inflector import Inflector
# Shared Inflector instance; used to singularize phrases during entity lookup.
inflector = Inflector()

In [None]:
def named_entity_recognition(label):
    """Tag phrases of *label* as cell line, cell type, tissue, drug, gene or disease.

    The label is split on single spaces and scanned with windows of three,
    two and then one token. Each window has parentheses, periods and commas
    removed and is tried against the verifiers in a fixed order (cell line,
    cell type, tissue, drug, gene, disease); the first verifier that accepts
    it wins. Accepted phrases are removed from the label, which affects the
    tokens seen by the next (smaller) window size.

    Args:
        label: Free-text label to annotate.

    Returns:
        Dict mapping each entity type to the list of matches. The
        ``expression`` list is never filled by this function.
    """
    skip_phrases = frozenset(['the', 'a', 'is', 'are', 'in', 'on', 'of', 'at', 'for', 'or', 'to', 'via', 'mice', "gene expression", "of", "an", "cancer", "as", "vs", "wt", "ko", "age", "secretase", "not", "set", "ii", "hypoxia-induced", "po", 'dna', 't', 'beige', 'beta', 'utr', 'b', 'or', 'de'])
    not_cell_lines = frozenset(['one', 'mouse', 'cl'])
    not_genes = frozenset(['fat', 'homodimer', 'embryonic', 'vs', 'w', 'cl', 'brown'])

    data = {
        "cell_line": [],
        "cell_type": [],
        "tissue": [],
        "gene": [],
        "expression": [],
        "drug": [],
        "disease": [],
    }
    for size in (3, 2, 1):
        # Tokens are taken once per window size, from the label as trimmed so far.
        tokens = label.split(" ")
        for start in range(len(tokens) - size + 1):
            phrase = " ".join(tokens[start:start + size])
            for char in "().,":
                phrase = phrase.replace(char, "")
            lowered = phrase.lower()
            if lowered in skip_phrases:
                continue
            singular = inflector.singularize(phrase)
            non_numeric = not phrase.isnumeric()

            if non_numeric and lowered not in not_cell_lines:
                found = cell_line_verifier(phrase)
                if found:
                    data['cell_line'].append(found)
                    label = label.replace(phrase, "")
                    continue

            found = cell_type_verifier(singular)
            if found:
                data['cell_type'].append(found)
                label = label.replace(phrase, "")
                continue

            found = tissue_verifier(singular)
            if found:
                data['tissue'].append(found)
                label = label.replace(phrase, "")
                continue

            found = drug_verifier(phrase)
            if found:
                data['drug'].append(found)
                label = label.replace(phrase, "")
                continue

            if non_numeric and lowered not in not_genes:
                # Parentheses were already removed from the phrase above.
                found = gene_verifier(phrase)
                if found:
                    data['gene'].append(phrase)
                    label = label.replace(phrase, "")
                    continue

            found = disease_verifier(phrase)
            if found:
                data['disease'].append(found)
                label = label.replace(phrase, "")
    return data

In [None]:
# Every distinct condition title, from either side of a comparison.
conditionset = set(df['condition_1'].to_list() + df['condition_2'].to_list())
conditionset
# Titles of 128 characters or more are cut at their first period.
conditions_short = {
    cond: cond.split('.')[0]
    for cond in conditionset
    if len(cond) >= 128
}

In [None]:
# Write the final GMT with shortened labels of the form
#   GSE_drug[_cell line | cell type | tissue]_expr_species_direction
# and record, per perturbation, the (control, perturbation) condition titles.
#
# For "reversed" signatures the conditions are swapped and the direction is
# flipped (up <-> dn), matching the labels used as keys of `gmt`.
failed = []
mapped_terms = {}   # new label -> original long label
new_gmt = {}        # sanitized new label -> set of genes
conds = {}          # sanitized label without direction -> (condition, condition)
with open('out/rummageo_chemical_perturbations_shortened.gmt', 'w') as o:
    for uid, val in df.iterrows():
        title = val["title"]
        term = val["term"].replace(".tsv", "")
        gse, cond1, _, cond2, species = term.split("-")
        condition1 = conditions_short.get(val["condition_1"], val["condition_1"])
        condition2 = conditions_short.get(val["condition_2"], val["condition_2"])
        conditions = (condition1, condition2)
        status = val["status"]
        direction = val["direction"]
        expr = f"{cond1}_v_{cond2}"
        search_term = val['search_term']
        if search_term in remove:
            continue
        if status == "reversed":
            expr = f"{cond2}_v_{cond1}"
            conditions = (condition2, condition1)
            direction = "dn" if direction == 'up' else 'up'
        old_label = f"{gse}_{title}_{expr}_{search_term}_{species}_{direction}"
        if old_label not in gmt:
            continue
        g = gmt[old_label]
        r = named_entity_recognition(old_label)
        label = gse.replace(",", "_")
        label = f"{label}_{search_term}"
        # Use the most specific biological context found. Sorting makes the
        # label independent of set iteration order, so it is reproducible.
        if r["cell_line"]:
            label = f"{label}_" + ",".join(sorted(set(r["cell_line"])))
        elif r["cell_type"]:
            label = f"{label}_" + ",".join(sorted(set(r["cell_type"])))
        elif r["tissue"]:
            label = f"{label}_" + ",".join(sorted(set(r["tissue"])))
        label += f"_{expr}_{species}_{direction}"
        mapped_terms[label] = old_label
        if g:
            clean_label = label.replace("/", "-").replace(",", " ")
            o.write("\t".join([clean_label, "", *g]) + "\n")
            new_gmt[clean_label] = set(g)
            # Strip only the trailing _up/_dn segment. Removing "_up"/"_dn"
            # anywhere in the label would corrupt names that contain them.
            conds[clean_label.rsplit("_", 1)[0]] = conditions
        else:
            failed.append(uid)

				

### Construct DataFrame from Shortened GMT

In [None]:
def load_gmt(file):
    """Read a GMT file into a ``{term: set_of_genes}`` dict.

    Each line is ``term <TAB> description <TAB> gene <TAB> gene ...``; the
    description column is ignored. The line terminator is removed before
    splitting so the last gene of a line does not carry a trailing newline.
    Blank lines and empty gene fields are skipped.

    Args:
        file: Path of the GMT file.

    Returns:
        Dict mapping each term to the set of its genes. If a term occurs more
        than once, the last occurrence wins.
    """
    gmt = {}
    with open(file, 'r') as f:
        for line in f:
            line = line.rstrip('\r\n')
            if not line:
                continue
            term, _, *geneset = line.split('\t')
            gmt[term] = {gene for gene in geneset if gene}
    return gmt

In [None]:
# Reload the shortened GMT from disk as {label: set of genes}.
new_gmt = load_gmt('out/rummageo_chemical_perturbations_shortened.gmt')

In [None]:
# One row per (gene set, gene); the trailing _up/_dn of the label becomes a
# signed Direction column and is then removed from the perturbation name.
direction_codes = {'up': 1, 'dn': -1}
rummageodrugpert = pd.DataFrame({
    'Drug Perturbation': list(new_gmt.keys()),
    'Gene': list(new_gmt.values()),
})
rummageodrugpert = rummageodrugpert.explode('Gene')
label_parts = rummageodrugpert['Drug Perturbation'].apply(lambda x: x.split('_'))
rummageodrugpert['Direction'] = label_parts.apply(lambda parts: direction_codes[parts[-1]])
rummageodrugpert['Drug Perturbation'] = label_parts.apply(lambda parts: '_'.join(parts[:-1]))
rummageodrugpert

In [None]:
# Ortholog table; keep human (9606) -> mouse (10090) pairs and index them as
# {mouse GeneID: human GeneID}.
orthologs = pd.read_csv('../../../mapping/source_files/gene_orthologs', sep='\t')
# Both conditions are combined into one mask. Chaining two boolean indexers
# applies the second mask to an already filtered frame, which pandas has to
# realign and warns about.
human_mouse = (orthologs['#tax_id'] == 9606) & (orthologs['Other_tax_id'] == 10090)
orthologdict = orthologs[human_mouse].set_index('Other_GeneID')['GeneID'].to_dict()
orthologs

In [None]:
# Mouse gene table; mouse_gene_ids maps symbol -> GeneID for taxon 10090.
mouse_gene_info = pd.read_csv('../../../mapping/source_files/mouse_gene_info', sep='\t')
mouse_gene_ids = mouse_gene_info[mouse_gene_info['#tax_id']==10090].set_index('Symbol')['GeneID'].to_dict()
mouse_gene_info

In [None]:
# Human gene table (taxon 9606) and the lookups derived from it:
#   human_gene_ids : GeneID -> symbol
#   geneinfo       : upper-cased symbol -> GeneID, description
#   geneids        : upper-cased symbol -> GeneID
human_gene_info = pd.read_csv('../../../mapping/source_files/human_gene_info', sep='\t')
human_gene_info = human_gene_info[human_gene_info['#tax_id']==9606]
by_symbol = human_gene_info.set_index('Symbol')
human_gene_ids = human_gene_info.set_index('GeneID')['Symbol'].to_dict()
geneinfo = by_symbol[['GeneID', 'description']]
geneinfo.index = geneinfo.index.map(str.upper)
geneids = by_symbol.copy()
geneids.index = geneids.index.map(str.upper)
geneids = geneids['GeneID'].to_dict()
human_gene_info

In [None]:
# Mouse symbol -> human symbol, for mouse genes that have a human ortholog
# present in the human gene table.
mouse_gene_dict = {}
for gene, mouse_id in mouse_gene_ids.items():
    if mouse_id not in orthologdict:
        continue
    human_id = orthologdict[mouse_id]
    if human_id in human_gene_ids:
        mouse_gene_dict[gene] = human_gene_ids[human_id]
len(mouse_gene_dict)

# Symbols of human protein-coding genes; used to filter the associations.
protein_coding = human_gene_info['type_of_gene'] == 'protein-coding'
human_gene_symbols = set(human_gene_info[protein_coding]['Symbol'])

In [None]:
# Translate mouse symbols to their human ortholog (other symbols pass through),
# then blank out anything that is not a human protein-coding symbol.
rummageodrugpert['Gene'] = rummageodrugpert['Gene'].apply(lambda x: mouse_gene_dict.get(x, x))
rummageodrugpert['Gene'] = rummageodrugpert['Gene'].apply(lambda x: np.nan if x not in human_gene_symbols else x)
# keep=False removes every row of a (perturbation, gene) pair that occurs more
# than once, i.e. pairs reported in both directions as well as pairs produced
# by several source genes mapping to the same symbol.
rummageodrugpert = rummageodrugpert.dropna()
rummageodrugpert = rummageodrugpert.drop_duplicates(subset=['Drug Perturbation', 'Gene'], keep=False)
rummageodrugpert = rummageodrugpert.reset_index(drop=True)
rummageodrugpert

In [None]:
# The string literal below records the counts from an earlier run (presumably
# kept for comparison); the last line recomputes them for the current data.
'''
Drug Perturbation     2753
Gene                 18912
Direction                2
'''
rummageodrugpert.nunique()


## Process Data for SQL Ingestion

### Dataset

In [None]:
# Dataset table row for SQL ingestion; the values follow the column order
# listed in the comment on the next line.
#(id, name, name_without_resource, description, association, gene_set_description, gene_sets_description, attribute_set_description, positive_association, negative_association, is_signed, is_continuous_valued, last_updated, directory, num_page_views, resource_fk, measurement_fk, dataset_group_fk, attribute_type_fk, attribute_group_fk, evidence_type, evidence_group, measurement_bias, attribute_type_plural)
(166, 'RummaGEO Drug Perturbation Signatures', 'Drug Perturbation Signatures', 'Drug perturbation signatures produced from automatically mined RNA-seq samples from GEO.', 'gene-drug perturbation associations by differential expression of gene following drug perturbation', 'genes differentially expressed following the {0} drug perturbation from the RummaGEO Drug Perturbation Signatures dataset.', 'sets of genes differentially expressed following drug perturbation from the RummaGEO Drug Perturbation Signatures dataset.', 'drug perturbations changing expression of {0} gene from the RummaGEO Drug Perturbation Signatures dataset.', 'increased expression', 'decreased expression', 1, 0, '2025-01-07', 'rummageochem', 0, 117, 16, 7, 8, 2, 'gene expression by RNA-seq', 'curated experimental data', 'high-throughput, data-driven', 'drug perturbations')

### Publication

In [None]:
# Publication link row; 166 is the dataset id used above. The first and last
# values are presumably the row id and the publication key - TODO confirm.
(252, 166, 162)

### Genes

In [None]:
# Existing gene table: list of known symbols and a symbol -> id lookup.
prodgenes = pd.read_csv('../../../tables/gene.csv')
prodgenelist = prodgenes['symbol'].to_list()
genefks = prodgenes.set_index('symbol')['id'].to_dict()

In [None]:
# Print a SQL row tuple for every gene of this dataset that is not yet in the
# gene table, and register the id assigned to it in genefks.
index = 58755
geneurl = 'https://ncbi.nlm.nih.gov/gene/'

# Set membership is O(1); testing against the list rescans it for every gene.
known_genes = set(prodgenelist)
for gene in rummageodrugpert['Gene'].apply(str.upper).unique():
    if gene in known_genes:
        continue
    gene_id = geneinfo.loc[gene, 'GeneID']
    print((index, gene, gene_id, geneinfo.loc[gene, 'description'], geneurl+str(gene_id)), end=',\n')
    genefks[gene] = index
    index += 1

### Attributes

In [None]:
# Print one attribute row tuple per drug perturbation and remember the id
# assigned to each in attributefks. The link is a RummaGEO term search for the
# first "_"-separated segment of the perturbation name.
attributefks = {}
url = 'https://rummageo.com/term-search?q='
index = 510174

for drugpert in rummageodrugpert['Drug Perturbation'].unique():
    description = ' vs '.join(conds[drugpert])
    link = f'{url}{drugpert.split("_")[0]}'
    print((index, drugpert, description, link, 111), end=',\n')
    attributefks[drugpert] = index
    index += 1

### Gene Sets

In [None]:
# Print one gene-set row tuple per drug perturbation (166 is the dataset id
# used above) and remember the id assigned to each in genesetfks.
genesetfks = {}
index = 136700000

for drugpert in rummageodrugpert['Drug Perturbation'].unique():
    description = ' vs '.join(conds[drugpert])
    link = f'{url}{drugpert.split("_")[0]}'
    print((index, drugpert, description, link, 166, 8, attributefks[drugpert]), end=',\n')
    genesetfks[drugpert] = index
    index += 1

In [None]:
# Re-number the rows stored in attributes.txt with consecutive ids starting at
# 510174, printing each rewritten row and recording old first field -> new id.
atindex = 510174
atdict = {}

with open('attributes.txt', 'r') as f:
    for line in f:
        # NOTE(review): this removes every "(" on the line, not only the
        # leading one, and splits on every ", " - fields that contain either
        # would be altered. Confirm the file never has such fields.
        line = line.replace('(','').replace('),\n','').split(', ')
        atdict[line[0]] = atindex
        line[0] = str(atindex)
        atindex += 1
        print(f'({", ".join(line)}),')

### Associations

In [None]:
# Association table for ingestion: names are replaced by foreign keys, columns
# renamed and reordered, and row ids start at 169000000.
associations = rummageodrugpert.copy()
associations['Drug Perturbation'] = associations['Drug Perturbation'].map(genesetfks)
associations['Gene'] = associations['Gene'].map(str.upper).map(genefks)
# Positional rename: relies on the column order Drug Perturbation, Gene, Direction.
associations.columns = ['gene_set_fk', 'gene_fk', 'threshold_value']
associations = associations.loc[:, ['gene_fk', 'gene_set_fk', 'threshold_value']]
associations = associations.rename_axis('id')
associations.index = associations.index + 169000000
associations.to_csv('../../../harmonizome-update/rummageodrug.csv')
associations

## Create Downloads

In [None]:
# Directory prefix (with trailing slash) for every download file written below.
output_path = 'downloads/'

### Gene-Attribute Ternary Matrix

In [None]:
# Gene x perturbation matrix with 1 (up), -1 (down) or 0 (no association).
# aggfunc is given by name: passing the builtin `max` is deprecated in pandas.
ternarymatrix = (
    pd.crosstab(
        rummageodrugpert['Gene'],
        rummageodrugpert['Drug Perturbation'],
        values=rummageodrugpert['Direction'],
        aggfunc='max',
    )
    .fillna(0)      # pairs without an association
    .astype(int)
)
ternarymatrixT = ternarymatrix.T
ternarymatrix.to_csv(output_path+'gene_attribute_matrix.txt.gz', sep='\t', compression='gzip')
ternarymatrix

### Gene-Attribute Edge List

In [None]:
# Edge list: one row per (gene, perturbation) with the gene's NCBI id and the
# signed direction as Threshold.
# .copy() makes the subset an independent frame, so adding a column does not
# trigger a SettingWithCopyWarning.
edgelist = rummageodrugpert[['Gene', 'Drug Perturbation', 'Direction']].copy()
edgelist['Gene ID'] = edgelist['Gene'].map(str.upper).map(geneids)
edgelist = edgelist[['Gene', 'Gene ID', 'Drug Perturbation', 'Direction']]
edgelist = edgelist.rename(columns={'Direction': 'Threshold'})
edgelist.to_csv(output_path+'gene_attribute_edges.txt.gz', sep='\t', compression='gzip')
edgelist

### Gene List

In [None]:
# Distinct (gene symbol, gene id) pairs that occur in the edge list.
geneslist = edgelist[['Gene', 'Gene ID']].drop_duplicates()
geneslist = geneslist.reset_index(drop=True)
geneslist.to_csv(output_path+'gene_list_terms.txt.gz', sep='\t', compression='gzip')
geneslist

### Attribute List

In [None]:
# Distinct drug perturbations that occur in the edge list.
attributeslist = edgelist[['Drug Perturbation']].drop_duplicates()
attributeslist = attributeslist.reset_index(drop=True)
attributeslist.to_csv(output_path+'attribute_list_entries.txt.gz', sep='\t', compression='gzip')
attributeslist

### Up Gene Set Library

In [None]:
# Up gene sets: for each perturbation, the genes with value 1. Sets with
# fewer than five genes are not written.
with open(output_path+'gene_set_library_up_crisp.gmt', 'w') as f:
    arr = ternarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)
    attributes = ternarymatrix.columns

    w, h = arr.shape  # w = number of rows (genes), h = number of columns
    for i in tqdm(range(h)):
        members = ternarymatrix.index[arr[:, i] == 1]
        if len(members) >= 5:
            print(attributes[i], '', *members, sep='\t', end='\n', file=f)

### Down Gene Set Library

In [None]:
# Down gene sets: for each perturbation, the genes with value -1. Sets with
# fewer than five genes are not written.
with open(output_path+'gene_set_library_dn_crisp.gmt', 'w') as f:
    arr = ternarymatrix.reset_index(drop=True).to_numpy(dtype=np.int_)
    attributes = ternarymatrix.columns

    w, h = arr.shape  # w = number of rows (genes), h = number of columns
    for i in tqdm(range(h)):
        members = ternarymatrix.index[arr[:, i] == -1]
        if len(members) >= 5:
            print(attributes[i], '', *members, sep='\t', end='\n', file=f)

### Up Attribute Set Library

In [None]:
# Up attribute sets: for each gene, the perturbations with value 1. Sets with
# fewer than five perturbations are not written.
with open(output_path+'attribute_set_library_up_crisp.gmt', 'w') as f:
    arr = ternarymatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)
    genes = ternarymatrixT.columns

    w, h = arr.shape  # w = number of rows (perturbations), h = number of columns
    for i in tqdm(range(h)):
        members = ternarymatrixT.index[arr[:, i] == 1]
        if len(members) >= 5:
            print(genes[i], '', *members, sep='\t', end='\n', file=f)

### Down Attribute Set Library

In [None]:
# Down attribute sets: for each gene, the perturbations with value -1. Sets
# with fewer than five perturbations are not written.
with open(output_path+'attribute_set_library_dn_crisp.gmt', 'w') as f:
    arr = ternarymatrixT.reset_index(drop=True).to_numpy(dtype=np.int_)
    genes = ternarymatrixT.columns

    w, h = arr.shape  # w = number of rows (perturbations), h = number of columns
    for i in tqdm(range(h)):
        members = ternarymatrixT.index[arr[:, i] == -1]
        if len(members) >= 5:
            print(genes[i],'', *members, sep='\t', end='\n', file=f)

### Gene Similarity Matrix

In [None]:
# Pairwise cosine similarity between gene rows (1 - cosine distance), saved as
# a labelled square matrix.
gene_vectors = ternarymatrix.to_numpy(dtype=np.int_)
gene_similarity_matrix = 1 - dist.squareform(dist.pdist(gene_vectors, 'cosine'))

gene_similarity_matrix = pd.DataFrame(
    data=gene_similarity_matrix,
    index=ternarymatrix.index,
    columns=ternarymatrix.index,
)
gene_similarity_matrix.index.name = None
gene_similarity_matrix.columns.name = None
gene_similarity_matrix.to_csv(output_path+'gene_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
gene_similarity_matrix

### Attribute Similarity Matrix

In [None]:
# Pairwise cosine similarity between perturbation rows (1 - cosine distance),
# saved as a labelled square matrix.
attribute_vectors = ternarymatrixT.to_numpy(dtype=np.int_)
attribute_similarity_matrix = 1 - dist.squareform(dist.pdist(attribute_vectors, 'cosine'))

attribute_similarity_matrix = pd.DataFrame(
    data=attribute_similarity_matrix,
    index=ternarymatrixT.index,
    columns=ternarymatrixT.index,
)
attribute_similarity_matrix.index.name = None
attribute_similarity_matrix.columns.name = None
attribute_similarity_matrix.to_csv(output_path+'attribute_similarity_matrix_cosine.txt.gz', sep='\t', compression='gzip')
attribute_similarity_matrix

### Knowledge Graph Serialization

In [None]:
# Knowledge-graph structures:
#   nodes: gene nodes keyed by integer gene id, perturbation nodes keyed by
#          'RummaGEO_' + perturbation name
#   edges: one directed edge per edge-list row, perturbation -> gene
# Columns are iterated directly; a .loc lookup per row is far slower and the
# up/down cases differ only in relation text and sign.
nodes = {}
edges = []

for gene_label, gene_id in zip(geneslist['Gene'], geneslist['Gene ID']):
    nodes[int(gene_id)] = {
        "type":"gene",
        "properties": {
            "id":int(gene_id),
            "label":gene_label
        }}

for drugpert_label in attributeslist['Drug Perturbation']:
    nodes['RummaGEO_'+drugpert_label] = {
        "type":"drug perturbation",
        "properties": {
            "id":'RummaGEO_'+drugpert_label,
            "label":drugpert_label
        }}

edge_columns = zip(
    edgelist['Drug Perturbation'],
    edgelist['Gene'],
    edgelist['Gene ID'],
    edgelist['Threshold'],
)
for drugpert_label, gene_label, gene_id, threshold in edge_columns:
    # Anything other than 1 is treated as a decrease, as before.
    increased = threshold == 1
    source_id = 'RummaGEO_' + drugpert_label
    edges.append({
        "source": source_id,
        "relation": "increases expression of" if increased else "decreases expression of",
        "target": int(gene_id),
        "properties":{
            "id": source_id+":"+str(gene_id),
            "source_id":source_id,
            "source_label":drugpert_label,
            "target_label":gene_label,
            "target_id": int(gene_id),
            "directed":True,
            "threshold":1 if increased else -1
        }})

#### RDF

In [None]:
# Write the edges as "subject predicate object ." lines after two prefix
# lines. Threshold 1 uses RO:0003003, everything else RO:0003002. The first
# "_" of the source id is turned into ":" (RummaGEO_x -> RummaGEO:x).
with open(output_path+'kg_serializations/rummageodrug.rdf', 'w') as f:
    print('@prefix gene: ncbi.nlm.nih.gov/gene/', file=f)
    print('@prefix RO: purl.obolibrary.org/RO_', file=f)

    print('', file=f)
    for edge in edges:
        props = edge['properties']
        predicate = 'RO:0003003' if props['threshold'] == 1 else 'RO:0003002'
        subject = props['source_id'].replace('_',':',1)
        print(subject, predicate, 'gene:'+str(props['target_id']), end=' .\n', file=f)

#### JSON

In [None]:
# Dump nodes and edges as one indented JSON document.
with open(output_path+'kg_serializations/rummageodrug.json', 'w') as f:
    document = {
        "Version":"1",
        "nodes": nodes,
        "edges": edges
    }
    # json.dump returns None; `serial` is kept only because the name existed before.
    serial = json.dump(document, f, indent=4)

#### TSV

In [None]:
# Node table (namespace, id, label), one row per KG node.
namespaces = {'gene':'NCBI Entrez', 'drug perturbation':'RummaGEO'}
nodeframe = pd.DataFrame(nodes).T
nodeframe['id'] = [props['id'] for props in nodeframe['properties']]
nodeframe['label'] = [props['label'] for props in nodeframe['properties']]
nodeframe['namespace'] = [namespaces[node_type] for node_type in nodeframe['type']]
nodeframe = nodeframe[['namespace', 'id', 'label']].reset_index(drop=True)
nodeframe.to_csv(output_path+'kg_serializations/rummageodrug_tsv/nodes.tsv', sep='\t')
nodeframe

In [None]:
# Edge table (source, relation, target, threshold), one row per KG edge.
edgeframe = pd.DataFrame(edges)
edgeframe['threshold'] = [props['threshold'] for props in edgeframe['properties']]
edgeframe = edgeframe[['source', 'relation', 'target', 'threshold']]
edgeframe.to_csv(output_path+'kg_serializations/rummageodrug_tsv/edges.tsv', sep='\t')
edgeframe

## Create Visualizations

### Gene Attribute Heat Map

In [None]:
# Clustered heat map of the gene x perturbation matrix; colours centred on 0,
# tick labels hidden.
sns.clustermap(ternarymatrix, cmap='seismic', center=0, square=True, xticklabels=False, yticklabels=False)

### Gene Similarity Clustered Heatmap

In [None]:
# Clustered heat map of the gene-gene cosine similarity matrix.
sns.clustermap(gene_similarity_matrix, cmap='seismic', center=0)

### Attribute Similarity Clustered Heatmap

In [None]:
# Clustered heat map of the perturbation-perturbation cosine similarity
# matrix; tick labels hidden.
sns.clustermap(attribute_similarity_matrix, cmap='seismic', center=0, xticklabels=False, yticklabels=False)

### UMAP

In [None]:
# NOTE: both definitions below rebind `load_gmt`, replacing the path-based
# loader defined earlier in the notebook; these take an open file object.
def load_gmt(file):
    """Read an "up" GMT from an open file into ``{term: "gene gene ..."}``.

    ``_up`` is appended to every term that does not contain 'consensus'.
    """
    gmt = OrderedDict()
    for line in file:
        term, blank, *geneset = line.strip().split('\t')
        if 'consensus' not in term:
            term = f'{term}_up'
        gmt[term] = ' '.join(set(geneset))
    return gmt
# The with-block closes the file; a bare open() left the handle open.
with open('downloads/gene_set_library_up_crisp.gmt', 'r') as up_file:
    libdict = load_gmt(up_file)
def load_gmt(file):
    """Read a "down" GMT from an open file; ``_down`` is appended to every term."""
    gmt = OrderedDict()
    for line in file:
        term, blank, *geneset = line.strip().split('\t')
        gmt[f'{term}_down'] = ' '.join(set(geneset))
    return gmt
with open('downloads/gene_set_library_dn_crisp.gmt', 'r') as down_file:
    downlibdict = load_gmt(down_file)
# Single library holding both directions, used for the UMAP below.
libdict.update(downlibdict)
scatterdir = 'images/'

In [None]:
def process_scatterplot(libdict, nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1):
    """Embed the gene sets of *libdict* in 2-D and assign them to clusters.

    The values of *libdict* are TF-IDF vectorized, a neighbour graph is built
    on that matrix, Leiden clustering and UMAP are run, and the result is
    returned as a table sorted by cluster.

    Args:
        libdict: Mapping of term -> text document; the values are passed to
            ``TfidfVectorizer`` as-is (presumably space-separated gene
            symbols - confirm against the caller).
        nneighbors: ``n_neighbors`` for ``sc.pp.neighbors``.
        mindist: ``min_dist`` for ``sc.tl.umap``.
        spread: ``spread`` for ``sc.tl.umap``.
        maxdf: ``max_df`` for ``TfidfVectorizer``.
        mindf: ``min_df`` for ``TfidfVectorizer``.

    Returns:
        DataFrame with columns ``x``, ``y`` (UMAP coordinates), ``cluster``,
        ``term`` and ``genes`` (the original *libdict* value of the term).
    """
    print("\tTF-IDF vectorizing gene set data...")
    vec = TfidfVectorizer(max_df=maxdf, min_df=mindf)
    X = vec.fit_transform(libdict.values())
    print(X.shape)
    adata = anndata.AnnData(X)
    # Label the observations with the gene set terms.
    adata.obs.index = libdict.keys()

    print("\tPerforming Leiden clustering...")
    ### the n_neighbors and min_dist parameters can be altered
    sc.pp.neighbors(adata, n_neighbors=nneighbors, use_rep='X')
    sc.tl.leiden(adata, resolution=1.0)
    # Fixed random_state for the UMAP layout.
    sc.tl.umap(adata, min_dist=mindist, spread=spread, random_state=42)

    # Reorder observations by their Leiden label, then prefix the label text.
    new_order = adata.obs.sort_values(by='leiden').index.tolist()
    adata = adata[new_order, :]
    adata.obs['leiden'] = 'Cluster ' + adata.obs['leiden'].astype('object')

    df = pd.DataFrame(adata.obsm['X_umap'])
    df.columns = ['x', 'y']

    df['cluster'] = adata.obs['leiden'].values
    df['term'] = adata.obs.index
    df['genes'] = [libdict[l] for l in df['term']]

    return df

In [None]:
def get_scatter_colors(df):
    """Map each distinct value of ``df['cluster']`` to a Category20 colour.

    The palette is reordered so the even-indexed colours come first; clusters
    beyond the twentieth reuse colours cyclically.
    """
    palette = list(Category20[20])
    colors = palette[::2] + palette[1::2]
    clusters = pd.unique(df['cluster']).tolist()
    color_mapper = {}
    for position, cluster in enumerate(clusters):
        color_mapper[cluster] = colors[position % 20]
    return color_mapper

def get_scatterplot(scatterdf):
    """Build the interactive Bokeh scatter plot of the UMAP embedding.

    Args:
        scatterdf: DataFrame with columns ``x``, ``y``, ``cluster`` and
            ``term``; it is copied, not modified.

    Returns:
        The Bokeh figure, with points coloured by cluster and a hover tooltip
        showing gene set, coordinates and cluster.
    """
    df = scatterdf.copy()
    color_mapper = get_scatter_colors(df)
    df['color'] = df['cluster'].map(color_mapper)

    # Every inner <div> is closed; the tooltip markup previously left two open.
    hover_emb = HoverTool(name="df", tooltips="""
        <div style="margin: 10">
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Gene Set:</span>
                <span style="font-size: 12px">@gene_set</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Coordinates:</span>
                <span style="font-size: 12px">(@x,@y)</span>
            </div>
            <div style="margin: 0 auto; width:300px;">
                <span style="font-size: 12px; font-weight: bold;">Cluster:</span>
                <span style="font-size: 12px">@cluster</span>
            </div>
        </div>
    """)
    tools_emb = [hover_emb, 'pan', 'wheel_zoom', 'reset', 'save']

    plot_emb = figure(
        width=1000, 
        height=700, 
        tools=tools_emb
    )

    # Column names here are the ones the tooltip (@gene_set, @x, @y, @cluster)
    # and the scatter glyph ('x', 'y', 'colors') refer to.
    source = ColumnDataSource(
        data=dict(
            x = df['x'],
            y = df['y'],
            gene_set = df['term'],
            cluster = df['cluster'],
            colors = df['color'],
            label = df['cluster']
        )
    )

    # hide axis labels and grid lines
    plot_emb.xaxis.major_tick_line_color = None
    plot_emb.xaxis.minor_tick_line_color = None
    plot_emb.yaxis.major_tick_line_color = None
    plot_emb.yaxis.minor_tick_line_color = None
    plot_emb.xaxis.major_label_text_font_size = '0pt'
    plot_emb.yaxis.major_label_text_font_size = '0pt' 

    plot_emb.output_backend = "svg"    
    
    plot_emb.title = 'Gene Sets in the RummaGEO Drug Perturbation Signatures Library'
    plot_emb.xaxis.axis_label = "UMAP_1"
    plot_emb.yaxis.axis_label = "UMAP_2"
    plot_emb.xaxis.axis_label_text_font_style = 'normal'
    plot_emb.xaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_size = '18px'
    plot_emb.yaxis.axis_label_text_font_style = 'normal'
    plot_emb.title.align = 'center'
    plot_emb.title.text_font_size = '18px'
    
    plot_emb.scatter(
        'x', 
        'y', 
        size = 4, 
        source = source, 
        color = 'colors'
    )
    
    return plot_emb

In [None]:
## defaults: nneighbors=30, mindist=0.1, spread=1.0, maxdf=1.0, mindf=1
# Compute the embedding; only mindf differs from the defaults (2 instead of 1).
scatter_df = process_scatterplot(libdict, nneighbors=30,mindist=0.1
     ,spread=1
     ,maxdf=1.0
     ,mindf=2
)

# Display Scatter Plot
plot = get_scatterplot(scatter_df)
output_notebook()
show(plot)

In [None]:
# Save the plot as a standalone HTML page. scatterdir already ends with "/",
# so the resulting path contains a double slash.
output_file(filename=f"{scatterdir}/umap.html", title = 'Gene Sets in RummaGEO Drug Perturbation Signatures Library')
save(plot)