In [1]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import csv
import requests
import os

In [2]:
# Load the gene-gene correlation matrix (Feather file) directly from S3.
# NOTE(review): the file is downloaded again on every run; consider caching it locally.
coexpression = pd.read_feather('https://s3.amazonaws.com/mssm-data/human_correlation_archs4.f')

In [3]:
# Label the rows with the column names so the matrix can be addressed by gene name on both axes.
# NOTE(review): this mutates `coexpression` in place and assumes rows and columns are in the same order.
coexpression.index = coexpression.columns

In [4]:
# Sanity check: the correlation matrix is expected to be square.
coexpression.shape

(26415, 26415)

In [5]:
# Merge every gene node table under out/sab/ into one dict: node id -> {column: value}.
genes = {}
# glob() returns files in arbitrary order; sorting makes it reproducible which
# file's values win when the same node id appears in more than one table.
for filename in sorted(glob('out/sab/*Gene.nodes.csv')):
    df = pd.read_csv(filename, index_col=0)
    if 'HGNC' not in df.columns:
        # Tables without an HGNC column are skipped; print their names for visibility.
        print(filename)
        continue
    for i, row in tqdm(df.iterrows(), total=len(df), desc=filename):
        # Values from the current file override those collected from earlier files.
        # NOTE(review): a NaN in a later file also overrides an earlier non-null
        # value for the same column; confirm that this is acceptable.
        genes[i] = {**genes.get(i, {}), **row}


13801it [00:00, 43136.07it/s]
13375it [00:00, 44137.09it/s]
3982it [00:00, 40771.50it/s]
4419it [00:00, 41753.22it/s]
34579it [00:00, 37480.84it/s]
677it [00:00, 32733.63it/s]
4547it [00:00, 39372.04it/s]
49987it [00:01, 47117.54it/s]


out/sab/MOTRPAC.Gene.nodes.csv


301it [00:00, 33777.97it/s]
34702it [00:00, 43628.63it/s]
1061it [00:00, 40311.58it/s]

out/sab/NPO.Gene.nodes.csv





In [6]:
# Number of distinct gene node ids collected from the node tables.
len(genes)

57393

In [7]:
# Unique gene labels across all collected gene nodes.
gene_names = {record['label'] for record in genes.values()}

In [8]:
# Fewer labels than node ids means some labels are shared by several nodes.
len(gene_names)

56534

In [9]:
# Gene labels that also appear as columns of the correlation matrix.
human_genes = gene_names & set(coexpression.columns)

In [10]:
# Number of gene labels that have a column in the correlation matrix.
len(human_genes)

20998

In [11]:
# One row per node id; columns are the union of the attributes seen across the node tables.
genes_df = pd.DataFrame.from_dict(genes, orient='index')

In [12]:
# Keep gene nodes that have a column in the correlation matrix and a non-null HGNC value.
in_matrix = genes_df["label"].isin(human_genes)
has_hgnc = genes_df["HGNC"].notna()
human_genes_df = genes_df[in_matrix & has_hgnc]

In [13]:
# Rows = gene nodes kept for the coexpression edges; columns = node attributes.
human_genes_df.shape

(20997, 8)

In [14]:
# Reverse lookup: gene label -> node id (if a label repeats, the last row wins).
gene_id = dict(zip(human_genes_df['label'], human_genes_df.index))

In [15]:
# List any previously generated "predicted" files in the output folder.
for predicted_path in glob("out/sab/*predicted*"):
    print(predicted_path)

In [16]:
# Restrict the correlation matrix, on both axes, to the labels of the retained gene nodes.
filtered_coex = coexpression.loc[human_genes_df.label, human_genes_df.label]

In [17]:
# Write the gene-gene coexpression edges: for every retained gene, its 10 most
# correlated genes that are also retained gene nodes.
relation = "ARCHS4_coexpressed_genes"
# Lower-case out/sab/ is where the rest of the notebook reads this file from;
# newline="" is what the csv module requires for files it writes.
with open("out/sab/ARCHS4.edges.csv", "w", newline="") as o:
    csv_writer = csv.writer(o)
    csv_writer.writerow(["", "source", "target", "relation", "SAB", "evidence_class", "hidden"])
    # NOTE(review): `index` advances once per source gene, not once per edge, so
    # the first column is not a unique row id; confirm whether that is intended.
    index = 0
    for uid, row in tqdm(human_genes_df.iterrows(), total=len(human_genes_df)):
        gene = row['label']
        # Rank every gene by correlation with `gene`. The gene itself is removed
        # by the explicit check below rather than by blindly dropping the first
        # entry, which would discard a real neighbour whenever the
        # self-correlation is not ranked first.
        ranked = coexpression[gene].sort_values(ascending=False)
        count = 0
        for coex, val in ranked.items():
            if coex != gene and coex in gene_id:
                cid = gene_id[coex]
                csv_writer.writerow([index, uid, cid, relation, 'ARCHS4', val, "true"])
                count += 1
            if count == 10:
                break
        index += 1




20997it [01:06, 315.36it/s]


In [18]:
# Export the retained gene nodes. The lower-case out/sab/ folder is where
# compress() and the rest of the notebook look for this file; the previous
# "out/SAB" spelling only resolved to it on case-insensitive file systems.
human_genes_df.to_csv("out/sab/ARCHS4.Gene.nodes.csv")

In [19]:
import zlib
import zipfile

def compress(file_names, zip_name, path="out/sab/", out_dir="out/compressed"):
    """Bundle files into a deflate-compressed zip archive.

    Parameters
    ----------
    file_names : list of str
        File names, relative to ``path``; each is stored under the same name
        inside the archive.
    zip_name : str
        Archive name without extension; the archive is written to
        ``<out_dir>/<zip_name>.zip``.
    path : str
        Folder that holds the input files.
    out_dir : str
        Folder that receives the archive; created if it does not exist.

    Returns
    -------
    str
        Path of the archive that was written.

    Missing input files are reported by name and skipped, so one absent file
    does not silently truncate the rest of the archive.
    """
    print("File Paths:")
    print(file_names)

    os.makedirs(out_dir, exist_ok=True)
    archive = os.path.join(out_dir, "%s.zip" % zip_name)

    # The context manager closes the archive even if adding a file fails.
    with zipfile.ZipFile(archive, mode="w", compression=zipfile.ZIP_DEFLATED) as zf:
        for file_name in file_names:
            source = os.path.join(path, file_name)
            if not os.path.isfile(source):
                print("File not found, skipped: %s" % source)
                continue
            # first parameter: file to zip, second: name inside the archive
            zf.write(source, file_name)
    return archive

In [58]:
# Archive the node and edge tables.
# NOTE(review): the execution counter of this cell is out of sequence with its neighbours.
compress(["ARCHS4.Gene.nodes.csv", "ARCHS4.edges.csv"], "ARCHS4")

File Paths:
['ARCHS4.Gene.nodes.csv', 'ARCHS4.edges.csv']


## Predicted Links

In [19]:
def fetch_and_save_library(library, file):
  '''Return the lines of an Enrichr gene-set library, downloading it on first use.

  library: Enrichr library name, inserted into the download URL.
  file: local cache path; if it already exists no request is made.

  Returns the cached text split into lines (an empty list for an empty file).
  Raises requests.HTTPError when the download fails, so that an error page is
  never cached as if it were a library.
  '''
  if not os.path.exists(file):
    if os.path.dirname(file):
      os.makedirs(os.path.dirname(file), exist_ok=True)
    gmt_url = "https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=text&libraryName=%s"%library
    res = requests.get(gmt_url)
    res.raise_for_status()
    with open(file, 'w') as o:
      o.write(res.text)

  with open(file) as o:
    content = o.read().strip()
  # "".split("\n") would yield [""], i.e. one bogus empty record.
  return content.split("\n") if content else []

In [20]:
# Fetch the LINCS L1000 chemical perturbation consensus signatures (cached under gmt/).
library = "LINCS_L1000_Chem_Pert_Consensus_Sigs"
gmt = fetch_and_save_library(library, "gmt/%s"%library)

In [21]:
# Split the library into up- and down-regulated gene sets keyed by term name.
up_gene_sets = {}
down_gene_sets = {}
for line in gmt:
    # Each line is: term, description, gene, gene, ...
    # `members` (not `genes`) so the node dict built earlier is not overwritten here.
    annot, _, *members = line.split("\t")
    # The direction is the trailing word of the term. Checking the suffix,
    # rather than searching for " Up" anywhere, keeps names that merely
    # contain " Up" from being misclassified or having text removed.
    direction = "Up" if annot.endswith(" Up") else "Down"
    suffix = " %s" % direction
    label = annot[:-len(suffix)] if annot.endswith(suffix) else annot
    if direction == "Up":
        up_gene_sets[label] = members
    else:
        down_gene_sets[label] = members
len(up_gene_sets), len(down_gene_sets)

(5425, 5425)

In [22]:
# LINCS compound nodes; the first CSV column becomes the index.
nodes_df = pd.read_csv('out/sab/LINCS.Compound.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,PUBCHEM
d361115c-2fc0-5686-821c-180822af818e,2-{3-[4-(4-Fluoro-phenyl)-piperazin-1-yl]-prop...,Compound,60785
ae8ceddb-ce98-558f-ab31-69e61e168d30,SA-1939938,Compound,60184895
43393ef6-f13d-50f8-9c2b-12cf4957f0b6,MW-STK33-1C,Compound,5765514
a4e9db83-a66b-5821-8b84-a51cb934ee03,calcipotriol,Compound,5288783
168f06a8-6c5e-5583-95d1-b62210bec876,Triprolidine,Compound,5282443


In [23]:
# Lower-cased compound label -> node id; the first node seen for a label is kept.
node_id_mapper = {}
for node_id, name in nodes_df["label"].items():
    node_id_mapper.setdefault(name.lower(), node_id)
len(node_id_mapper), nodes_df.shape

(4518, (4523, 3))

In [24]:
# How many library terms can be mapped to a compound node by lower-cased name?
gene_sets = up_gene_sets
lowered_terms = {term.lower() for term in gene_sets}
matched = lowered_terms.intersection(node_id_mapper)
missing = lowered_terms.difference(node_id_mapper)
len(matched), len(missing)



(3341, 2084)

In [25]:
# Append predicted compound-gene edges: for each LINCS signature that maps to a
# compound node, the 10 genes with the highest mean coexpression with the
# signature's genes. No header row is written because this appends to the
# existing edge table (lower-case out/sab/, where the notebook reads it back).
with open("out/sab/ARCHS4.edges.csv", "a", newline="") as o:
    csv_writer = csv.writer(o)
    index = 0
    # One pass per direction, so both share exactly the same logic. Previously
    # the Down pass never looked up `uid` (it reused the last Up compound) and
    # wrote the SAB as 'ARCS4'.
    for relation, direction_sets in (
        ("ARCHS4_coexpressed_genes_LINCS_Up", up_gene_sets),
        ("ARCHS4_coexpressed_genes_LINCS_Down", down_gene_sets),
    ):
        for label, genes in direction_sets.items():
            l = label.lower()
            if l not in node_id_mapper:
                continue
            uid = node_id_mapper[l]
            matched_genes = list(set(genes).intersection(filtered_coex.index))
            if not matched_genes:
                # Nothing to average: every mean would be NaN.
                continue
            seed_genes = set(matched_genes)
            mean_coex = filtered_coex[matched_genes].mean(axis=1).sort_values(ascending=False)
            count = 0
            for coex, val in mean_coex.items():
                # Skip the signature's own genes. The old check compared against
                # `gene`, a stale variable left over from the gene-gene cell.
                # NOTE(review): confirm that seed genes should be excluded.
                if coex not in seed_genes and coex in gene_id:
                    cid = gene_id[coex]
                    csv_writer.writerow([index, uid, cid, relation, 'ARCHS4', val, "true"])
                    count += 1
                if count == 10:
                    break
            index += 1

## MW

In [26]:
# Fetch the Metabolomics Workbench metabolite gene-set library (cached under gmt/).
library = "Metabolomics_Workbench_Metabolites_2022"
gmt = fetch_and_save_library(library, "gmt/%s"%library)

In [27]:
# Metabolite nodes; the first CSV column becomes the index.
nodes_df = pd.read_csv('out/sab/MW.Metabolite.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,PUBCHEM,HMDB,PUBMED
d560db96-2ad7-5410-9015-f209f93be814,Procaine,Metabolite,4914.0,,
1d723a50-eddb-5946-ab30-95b9e6f7cf4e,"3,3'-Thiobispropanoic acid",Metabolite,8096.0,,
41242503-8812-5a54-800f-d7ddb050a62d,Cefazolin,Metabolite,33255.0,,
f9ea4ebd-0fcc-5d54-b0c5-86f9810079b5,hexanoylcarnitine,Metabolite,6426853.0,,
83b461e0-3807-5196-b679-eea7f78cf2c4,glaucarubinone,Metabolite,441796.0,,


In [28]:
# Term -> gene list. This library has no Up/Down split, so the unused
# `direction` line (which read a stale `annot` from an earlier cell) is dropped.
gene_sets = {}
for line in gmt:
    # Each line is: term, description, gene, gene, ...
    label, _, *genes = line.split("\t")
    gene_sets[label] = genes
len(gene_sets)

233

In [29]:
# Lower-cased metabolite label -> node id; the first node seen for a label is kept.
node_id_mapper = {}
for node_id, name in nodes_df["label"].items():
    node_id_mapper.setdefault(name.lower(), node_id)
print(len(node_id_mapper), nodes_df.shape)

# Library terms that do / do not map to a metabolite node by lower-cased name.
lowered_terms = {term.lower() for term in gene_sets}
matched = lowered_terms.intersection(node_id_mapper)
missing = lowered_terms.difference(node_id_mapper)
len(matched), len(missing)



9007 (9725, 5)


(156, 77)

In [30]:
# Append predicted metabolite-gene edges: for each gene set that maps to a
# metabolite node, the 10 genes with the highest mean coexpression with the
# set. No header row is written because this appends to the existing edge
# table (lower-case out/sab/, where the notebook reads it back).
with open("out/sab/ARCHS4.edges.csv", "a", newline="") as o:
    csv_writer = csv.writer(o)
    index = 0
    relation = "ARCHS4_coexpressed_genes_MW"
    for label, genes in gene_sets.items():
        l = label.lower()
        if l not in node_id_mapper:
            continue
        uid = node_id_mapper[l]
        matched_genes = list(set(genes).intersection(filtered_coex.index))
        if not matched_genes:
            # Nothing to average: every mean would be NaN.
            continue
        seed_genes = set(matched_genes)
        mean_coex = filtered_coex[matched_genes].mean(axis=1).sort_values(ascending=False)
        count = 0
        for coex, val in mean_coex.items():
            # Skip the set's own genes. The old check compared against `gene`,
            # a stale variable left over from the gene-gene cell.
            # NOTE(review): confirm that seed genes should be excluded.
            if coex not in seed_genes and coex in gene_id:
                cid = gene_id[coex]
                csv_writer.writerow([index, uid, cid, relation, 'ARCHS4', val, "true"])
                count += 1
            if count == 10:
                break
        index += 1

## GTEx
GTEx signatures come in multiple contexts (tissue, sex, age range). Should we create separate edges per context or a single consensus per tissue?

In [31]:
# Fetch the GTEx tissue signature library (cached under gmt/).
library = "GTEx_Tissues_V8_2023"
gmt = fetch_and_save_library(library, "gmt/%s"%library)

In [32]:
import re

In [33]:
# Open question: each term carries its own context (tissue, sex, age range);
# should these be merged into consensus signatures?
gene_sets = {}
dirs = set()
# Regex describing the term layout; not used in this loop, only in the check further down.
pattern = "(?P<tissue>.*) (?P<sex>.*) (?P<age>[0-9]*-[0-9]*) (?P<direction>.*)"
for line in gmt:
    annot, _, *genes = line.split("\t")
    # The direction is the trailing word of the term; test the suffix instead of
    # searching for " Up" anywhere, which would misfire on a name containing
    # " Up" (for example a word starting with "Up") and cut text out of it.
    direction = "Up" if annot.endswith(" Up") else "Down"
    dirs.add(direction)
    suffix = " %s" % direction
    label = annot[:-len(suffix)] if annot.endswith(suffix) else annot
    # NOTE(review): the Up and Down terms of one context share the same label
    # here, so whichever comes later in the file replaces the other.
    gene_sets[label] = genes
len(gene_sets)

511

In [34]:
# Try the term regex on a sample annotation (this overwrites the loop variable `annot`).
annot = "Bladder Bo Female 40-49 Up"
re.match(pattern, annot).groupdict()

{'tissue': 'Bladder Bo', 'sex': 'Female', 'age': '40-49', 'direction': 'Up'}

In [35]:
# Anatomy nodes of the GTEXEQTL table; the first CSV column becomes the index.
nodes_df = pd.read_csv('out/sab/GTEXEQTL.Anatomy.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,LNC,UBERON,FMA,NCI,SNOMEDCT_US,CHV,UWDA,MSH,EFO
dc62d6d3-7d4d-56c1-97ef-f2087c46d03d,Hypothalamic structure,Anatomy,,1898.0,62008.0,C12458,67923007.0,6498.0,,D007031,
4e273fbd-7db9-56d9-ad00-de7e246f33fc,Structure of putamen,Anatomy,,1874.0,61834.0,C12452,89278009.0,10426.0,,D011699,
5ca230ff-6fcf-585a-a008-2ac2b7c3452f,Body of pancreas,Anatomy,,1150.0,14518.0,C12270,40133006.0,22141.0,14518.0,,
8ac4cdaa-ad7a-5652-9fb5-aa8af53480f8,Esophageal Squamous Epithelium,Anatomy,,6920.0,,C49222,,,,,
e186ef2e-b773-502f-87c6-d6f4606dbce1,Thyroid Gland,Anatomy,LP7636-6,2046.0,9603.0,C160209,69748006.0,12262.0,9603.0,D013961,


In [36]:
# Lower-cased anatomy label -> node id; the first node seen for a label is kept.
node_id_mapper = {}
for node_id, name in nodes_df["label"].items():
    node_id_mapper.setdefault(name.lower(), node_id)
print(len(node_id_mapper), nodes_df.shape)

48 (48, 11)


In [37]:
# Anatomy nodes of the GTEXEXP table; the first CSV column becomes the index.
nodes_df = pd.read_csv('out/sab/GTEXEXP.Anatomy.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,LNC,UBERON,FMA,NCI,SNOMEDCT_US,CHV,UWDA,MSH,EFO
dc62d6d3-7d4d-56c1-97ef-f2087c46d03d,Hypothalamic structure,Anatomy,,1898.0,62008.0,C12458,67923007.0,6498.0,,D007031,
5ca230ff-6fcf-585a-a008-2ac2b7c3452f,Body of pancreas,Anatomy,,1150.0,14518.0,C12270,40133006.0,22141.0,14518.0,,
8ac4cdaa-ad7a-5652-9fb5-aa8af53480f8,Esophageal Squamous Epithelium,Anatomy,,6920.0,,C49222,,,,,
8ed9557e-1f3e-5cb0-9c30-52ef753bf355,Pituitary Gland,Anatomy,LP113983-3,7.0,13889.0,C12399,56329008.0,9761.0,13889.0,D010902,
e186ef2e-b773-502f-87c6-d6f4606dbce1,Thyroid Gland,Anatomy,LP7636-6,2046.0,9603.0,C160209,69748006.0,12262.0,9603.0,D013961,


In [38]:
# Extend the existing mapper with this table's labels; ids already present are kept.
for node_id, name in nodes_df["label"].items():
    node_id_mapper.setdefault(name.lower(), node_id)
print(len(node_id_mapper), nodes_df.shape)

53 (44, 11)


In [39]:
# Library terms that do / do not map to an anatomy node by lower-cased name.
lowered_terms = {term.lower() for term in gene_sets}
missing = lowered_terms.difference(node_id_mapper)
matched = lowered_terms.intersection(node_id_mapper)
len(matched), len(missing)



(0, 511)

## IDG

In [40]:
# Fetch the IDG drug target gene-set library (cached under gmt/).
library = "IDG_Drug_Targets_2022"
gmt = fetch_and_save_library(library, "gmt/%s"%library)

In [41]:
# Term -> gene list. The copied "different context" comment and the unused
# `dirs` set belonged to the GTEx cell and are removed here.
gene_sets = {}
for line in gmt:
    # Each line is: term, description, gene, gene, ...
    label, _, *genes = line.split("\t")
    gene_sets[label] = genes
len(gene_sets)

888

In [42]:
# Compound nodes of the IDGP table; the first CSV column becomes the index.
nodes_df = pd.read_csv('out/sab/IDGP.Compound.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,PUBCHEM
e69476ca-67a9-5b04-8f2d-9618e54f8b66,4-chloro-2-fluoro-5-(4-(3-fluorophenyl)-4-(2-(...,Compound,54580324
a4b29d0d-de05-57f8-b596-992fb0898185,1-(3-fluorobenzyl)-3-(1-((1-(4-(trifluoromethy...,Compound,11712583
57eb7630-1925-5e09-b134-bcc47ead1d99,"5-Sulfamoyl-thieno[2,3-b]thiophene-2-carboxyli...",Compound,14700212
db4b22a5-7932-56ea-92c9-f88277000de6,"US9278954, 1.65",Compound,118919539
b80140a4-a8d1-5941-bcf5-9fc0602cd840,2-(4-tert-butylphenyl)-4-(4-((1-methyl-1H-imid...,Compound,11626096


In [43]:
# Lower-cased compound label -> node id; the first node seen for a label is kept.
node_id_mapper = {}
for node_id, name in nodes_df["label"].items():
    node_id_mapper.setdefault(name.lower(), node_id)
print(len(node_id_mapper), nodes_df.shape)



322568 (324293, 3)


In [44]:
# Compound nodes of the IDGD table; the first CSV column becomes the index.
nodes_df = pd.read_csv('out/sab/IDGD.Compound.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,PUBCHEM
3c84c739-0851-5512-b041-af9bb76d18f9,Thiocolchicoside,Compound,9915886
31b33745-5c09-5ba9-9752-b96a9f51f946,Darolutamide,Compound,67171867
41242503-8812-5a54-800f-d7ddb050a62d,Cefazolin,Compound,33255
b93561c4-1424-5689-85ef-389e8cd7b0b1,netarsudil,Compound,66599893
e8ead698-de45-5008-b25f-9bb387a98ae6,Xipamide,Compound,26618


In [45]:
# Extend the existing mapper with this table's labels; ids already present are kept.
for node_id, name in nodes_df["label"].items():
    node_id_mapper.setdefault(name.lower(), node_id)
print(len(node_id_mapper))



323556


In [46]:
# Library terms that do / do not map to a compound node by lower-cased name.
lowered_terms = {term.lower() for term in gene_sets}
missing = lowered_terms.difference(node_id_mapper)
matched = lowered_terms.intersection(node_id_mapper)
len(matched), len(missing)



(514, 374)

In [47]:
# Append predicted compound-gene edges: for each gene set that maps to a
# compound node, the 10 genes with the highest mean coexpression with the set.
# No header row is written because this appends to the existing edge table
# (lower-case out/sab/, where the notebook reads it back).
with open("out/sab/ARCHS4.edges.csv", "a", newline="") as o:
    csv_writer = csv.writer(o)
    index = 0
    relation = "ARCHS4_coexpressed_genes_IDG"
    for label, genes in gene_sets.items():
        l = label.lower()
        if l not in node_id_mapper:
            continue
        uid = node_id_mapper[l]
        matched_genes = list(set(genes).intersection(filtered_coex.index))
        if not matched_genes:
            # Nothing to average: every mean would be NaN.
            continue
        seed_genes = set(matched_genes)
        mean_coex = filtered_coex[matched_genes].mean(axis=1).sort_values(ascending=False)
        count = 0
        for coex, val in mean_coex.items():
            # Skip the set's own genes. The old check compared against `gene`,
            # a stale variable left over from the gene-gene cell.
            # NOTE(review): confirm that seed genes should be excluded.
            if coex not in seed_genes and coex in gene_id:
                cid = gene_id[coex]
                csv_writer.writerow([index, uid, cid, relation, 'ARCHS4', val, "true"])
                count += 1
            if count == 10:
                break
        index += 1

## HuBMAP

In [48]:
# Fetch the HuBMAP ASCT+B augmented gene-set library (cached under gmt/).
library = "HuBMAP_ASCTplusB_augmented_2022"
gmt = fetch_and_save_library(library, "gmt/%s"%library)

In [49]:
# Term -> gene list; each library line is: term, description, gene, gene, ...
gene_sets = {}
dirs = set()
for label, _, *genes in (record.split("\t") for record in gmt):
    gene_sets[label] = genes
len(gene_sets)

777

In [50]:
# Peek at the last term left in `label` by the loop above.
label

'Hillock Cell Of Prostatic urethra - Prostate Gland'

In [51]:
# Inspect the term naming scheme.
# NOTE(review): this prints every key; showing a small sample would keep the output short.
gene_sets.keys()

dict_keys(['Paneth Cell Of Epithelium Proper Of Small intestine - Small Intestine', 'Duodenal Goblet cell - Small Intestine', 'Tuft cell - Small Intestine', 'absorptive - Small Intestine', 'enterocyte - Small Intestine', 'Endocrine Lineage cell - Small Intestine', 'Stem cell - Small Intestine', 'endothelial - Small Intestine', 'myofibroblast - Small Intestine', 'fibroblast - Small Intestine', 'B cell - Small Intestine', 'plasma - Small Intestine', 'CD4+ T - Small Intestine', 'Regulatory CD4+ T - Small Intestine', 'CD8+ T - Small Intestine', 'macrophage - Small Intestine', 'Mast cell - Small Intestine', 'fibroblasts - Small Intestine', 'Endothelial cells - Small Intestine', 'Epithelial Stem cells - Small Intestine', 'Paneth - Small Intestine', 'endocrine - Small Intestine', 'Jejunal Goblet cell - Small Intestine', 'enterocytes - Small Intestine', 'Tuft cells - Small Intestine', 'myofibroblasts - Small Intestine', 'macrophages - Small Intestine', 'Mast cells - Small Intestine', 'endoethl

In [52]:
# Anatomy nodes of the HMAZ table; the first CSV column becomes the index.
nodes_df = pd.read_csv('out/sab/HMAZ.Anatomy.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,AZ,CL
9412f8ee-f98c-5f7d-bfa7-700a765e3100,non-classical monocyte,Anatomy,55,875.0
3ccdfeb4-78e8-5e6f-8aa8-297ce6a03ef5,fibroblast,Anatomy,41,57.0
ae7fc97c-9139-5ccf-9700-1a01f48e82d1,kidney outer medulla collecting duct principal...,Anatomy,57,1000716.0
55efbc35-3f92-5cc5-a457-d4a15043f889,kidney cortex collecting duct intercalated cell,Anatomy,30,1000715.0
5d35b5d4-8925-5c9f-99eb-a63326686223,Regular atrial cardiac myocyte,Anatomy,3,2129.0


In [53]:
# Lower-cased node label -> node id; the first node seen for a label is kept.
node_id_mapper = {}
for node_id, name in nodes_df["label"].items():
    node_id_mapper.setdefault(name.lower(), node_id)
print(len(node_id_mapper), nodes_df.shape)



92 (92, 4)


In [54]:
# Compare the part of each term before " - " (lower-cased) with the node labels.
term_heads = {term.lower().split(" - ")[0] for term in gene_sets}
missing = term_heads.difference(node_id_mapper)
matched = term_heads.intersection(node_id_mapper)
len(matched), len(missing)



(36, 572)

In [55]:
# Append predicted node-gene edges: for each gene set whose term prefix (the
# part before " - ") maps to a node, the 10 genes with the highest mean
# coexpression with the set. No header row is written because this appends to
# the existing edge table (lower-case out/sab/, where the notebook reads it back).
with open("out/sab/ARCHS4.edges.csv", "a", newline="") as o:
    csv_writer = csv.writer(o)
    index = 0
    relation = "ARCHS4_coexpressed_genes_HuBMAP"
    for label, genes in gene_sets.items():
        l = label.lower().split(" - ")[0]
        if l not in node_id_mapper:
            continue
        uid = node_id_mapper[l]
        matched_genes = list(set(genes).intersection(filtered_coex.index))
        if not matched_genes:
            # Nothing to average: every mean would be NaN.
            continue
        seed_genes = set(matched_genes)
        mean_coex = filtered_coex[matched_genes].mean(axis=1).sort_values(ascending=False)
        count = 0
        for coex, val in mean_coex.items():
            # Skip the set's own genes. The old check compared against `gene`,
            # a stale variable left over from the gene-gene cell.
            # NOTE(review): confirm that seed genes should be excluded.
            if coex not in seed_genes and coex in gene_id:
                cid = gene_id[coex]
                csv_writer.writerow([index, uid, cid, relation, 'ARCHS4', val, "true"])
                count += 1
            if count == 10:
                break
        index += 1

## GlyGen

In [56]:
# Fetch the GlyGen glycosylated proteins gene-set library (cached under gmt/).
library = "GlyGen_Glycosylated_Proteins_2022"
gmt = fetch_and_save_library(library, "gmt/%s"%library)

In [57]:
# Term -> gene list; each library line is: term, description, gene, gene, ...
gene_sets = {}
dirs = set()
for label, _, *genes in (record.split("\t") for record in gmt):
    gene_sets[label] = genes
len(gene_sets)

338

In [58]:
# Glytoucan nodes of the GLYCANS table; the first CSV column becomes the index.
nodes_df = pd.read_csv('out/sab/GLYCANS.Glytoucan.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,GLYTOUCAN
7fd257c7-6899-5323-b2ba-08ee6765425c,1314.43,Glytoucan,G27172ZA
5833bb06-4b85-5c27-8930-5de7bafc6ac8,1786.65,Glytoucan,G42824SU
24e5b929-9448-57d5-a50f-a97756888e66,2209.82,Glytoucan,G99691UT
34af2b59-996b-5968-b248-03f867602db5,1347.48,Glytoucan,G28565EX
d0b1ef0c-7bb3-56d4-ad72-10d1d4e4a9f1,2370.88,Glytoucan,G66689PB


In [59]:
# Lower-cased GLYTOUCAN accession -> node id (keyed on the accession column, not the label).
node_id_mapper = {}
for node_id, accession in nodes_df["GLYTOUCAN"].items():
    node_id_mapper.setdefault(accession.lower(), node_id)
print(len(node_id_mapper), nodes_df.shape)



33755 (33755, 3)


In [60]:
# Glytoucan nodes of the PROTEOFORM table; the first CSV column becomes the index.
nodes_df = pd.read_csv('out/sab/PROTEOFORM.Glytoucan.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,GLYTOUCAN
6293c02a-b11b-51d2-a63c-c8bcbcf94ef6,2222.78,Glytoucan,G59616TT
06140b52-f59e-5361-b7dc-ac172d3716c9,586.22,Glytoucan,G32550BI
b30bb963-08c1-52e0-81ac-2d5f84d346c5,1462.54,Glytoucan,G80858MF
d215eed3-8336-5711-ac14-1f251d7fcae1,2475.89,Glytoucan,G60743GT
853af05a-dca0-5ca4-a8ff-fbfd0f22c8f9,3755.33,Glytoucan,G90784AP


In [61]:
# Extend the existing mapper with this table's accessions; ids already present are kept.
for node_id, accession in nodes_df["GLYTOUCAN"].items():
    node_id_mapper.setdefault(accession.lower(), node_id)
print(len(node_id_mapper), nodes_df.shape)



33755 (1554, 3)


In [62]:
# Compare the part of each term before " - " (lower-cased) with the accessions in the mapper.
term_heads = {term.lower().split(" - ")[0] for term in gene_sets}
missing = term_heads.difference(node_id_mapper)
matched = term_heads.intersection(node_id_mapper)
len(matched), len(missing)



(338, 0)

In [63]:
# Append predicted glycan-gene edges: for each gene set whose term maps to a
# Glytoucan node, the 10 genes with the highest mean coexpression with the
# set. No header row is written because this appends to the existing edge
# table (lower-case out/sab/, where the notebook reads it back).
with open("out/sab/ARCHS4.edges.csv", "a", newline="") as o:
    csv_writer = csv.writer(o)
    index = 0
    relation = "ARCHS4_coexpressed_genes_Glygen"
    for label, genes in gene_sets.items():
        l = label.lower()
        if l not in node_id_mapper:
            continue
        uid = node_id_mapper[l]
        matched_genes = list(set(genes).intersection(filtered_coex.index))
        if not matched_genes:
            # Nothing to average: every mean would be NaN.
            continue
        seed_genes = set(matched_genes)
        mean_coex = filtered_coex[matched_genes].mean(axis=1).sort_values(ascending=False)
        count = 0
        for coex, val in mean_coex.items():
            # Skip the set's own genes. The old check compared against `gene`,
            # a stale variable left over from the gene-gene cell.
            # NOTE(review): confirm that seed genes should be excluded.
            if coex not in seed_genes and coex in gene_id:
                cid = gene_id[coex]
                csv_writer.writerow([index, uid, cid, relation, 'ARCHS4', val, "true"])
                count += 1
            if count == 10:
                break
        index += 1

In [64]:
# Archive the node and edge tables.
# NOTE(review): the recorded run failed here with NameError, i.e. the cell defining
# compress() had not been executed in that kernel session. This call also runs before
# the hidden_tag column is written to the edge file further down.
compress(["ARCHS4.Gene.nodes.csv", "ARCHS4.edges.csv"], "ARCHS4")

NameError: name 'compress' is not defined

In [65]:
# Reload the edge table and keep only the named edge columns (drops the unnamed first column).
edge_columns = ['source', 'target', 'relation', 'SAB', 'evidence_class', 'hidden']
df = pd.read_csv("out/sab/ARCHS4.edges.csv")
df = df[edge_columns]
df.head()

Unnamed: 0,source,target,relation,SAB,evidence_class,hidden
0,2bbf802a-884e-5db4-a738-c858576cc15f,daa8439b-908e-5074-a367-a3c60cba1ddb,ARCHS4_coexpressed_genes,ARCHS4,0.49602,True
1,2bbf802a-884e-5db4-a738-c858576cc15f,c96cb7ae-137e-53c5-8eec-116e6a4cd154,ARCHS4_coexpressed_genes,ARCHS4,0.494161,True
2,2bbf802a-884e-5db4-a738-c858576cc15f,0feee71b-83da-5cb5-830d-2f08f52b58bb,ARCHS4_coexpressed_genes,ARCHS4,0.438561,True
3,2bbf802a-884e-5db4-a738-c858576cc15f,07976098-bb80-5780-8c58-82d929750767,ARCHS4_coexpressed_genes,ARCHS4,0.438128,True
4,2bbf802a-884e-5db4-a738-c858576cc15f,3e0ca18d-2ad1-5511-8f19-26e41b343c96,ARCHS4_coexpressed_genes,ARCHS4,0.426753,True


In [66]:
# Tag each edge by how it was derived. Two vectorised assignments replace the
# row-by-row `df.at` loop and produce the same column.
df["hidden_tag"] = "predicted term-gene association via coexpression"
df.loc[df["relation"] == 'ARCHS4_coexpressed_genes', "hidden_tag"] = "gene-gene coexpression"

In [67]:
# Overwrite the edge table with the tagged version; the DataFrame index is written as an unnamed first column.
df.to_csv("out/sab/ARCHS4.edges.csv")