In [1]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import csv
import requests
import os

In [2]:
# Load the ARCHS4 human correlation matrix (feather file) directly from S3.
coexpression = pd.read_feather('https://s3.amazonaws.com/mssm-data/human_correlation_archs4.f')

In [3]:
# Label the rows with the column names so the matrix can be addressed by name on both axes.
# NOTE(review): assumes the rows are stored in the same order as the columns — confirm.
coexpression.index = coexpression.columns

In [4]:
# Sanity check: rows and columns should have the same length after relabelling.
coexpression.shape

(26415, 26415)

In [5]:
# Merge every Gene node file that has an HGNC column into one
# {node id -> attribute dict} mapping. Files without that column are only
# reported by name and contribute nothing.
genes = {}
for filename in glob('out/sab/*Gene.nodes.csv'):
    df = pd.read_csv(filename, index_col=0)
    if 'HGNC' not in df.columns:
        print(filename)
        continue
    for i, row in tqdm(df.iterrows()):
        # Values from the current file override earlier ones on shared keys;
        # keys seen only in earlier files are kept.
        genes[i] = {**genes.get(i, {}), **row}


15807it [00:00, 43352.10it/s]
13375it [00:00, 44262.42it/s]
3983it [00:00, 40992.58it/s]
4419it [00:00, 43333.66it/s]
34579it [00:00, 43903.37it/s]
677it [00:00, 38841.46it/s]
4547it [00:00, 41155.23it/s]
49987it [00:01, 47631.76it/s]


out/sab/MOTRPAC.Gene.nodes.csv


20997it [00:00, 41167.30it/s]
301it [00:00, 37739.08it/s]
34702it [00:00, 45659.38it/s]
1061it [00:00, 42530.29it/s]


out/sab/NPO.Gene.nodes.csv


8334it [00:00, 53599.32it/s]


In [6]:
# Number of distinct gene node ids collected across the node files.
len(genes)

57541

In [7]:
# Distinct labels across all collected gene nodes.
gene_names = {attrs['label'] for attrs in genes.values()}

In [8]:
# Fewer labels than node ids means some labels are shared by several nodes.
len(gene_names)

56635

In [9]:
# Labels that also appear as a column of the co-expression matrix.
human_genes = gene_names & set(coexpression.columns)

In [10]:
# How many gene labels have a column in the co-expression matrix.
len(human_genes)

20998

In [11]:
# One row per gene node id; columns are the union of the attributes merged above.
genes_df = pd.DataFrame.from_dict(genes, orient='index')

In [12]:
# Keep the gene nodes that have both a co-expression column and an HGNC id.
has_coexpression = genes_df['label'].isin(human_genes)
has_hgnc = genes_df['HGNC'].notna()
human_genes_df = genes_df[has_coexpression & has_hgnc]

In [13]:
# Number of gene nodes (rows) and attributes (columns) kept for the ARCHS4 export.
human_genes_df.shape

(20997, 8)

In [14]:
# Reverse lookup from gene label to node id. When a label occurs on several
# rows, the id of the last such row is the one kept.
gene_id = dict(zip(human_genes_df['label'], human_genes_df.index))

In [15]:
# List any existing "predicted" files in the node/edge directory (prints nothing when there are none).
for i in glob("out/sab/*predicted*"):
	print(i)

In [16]:
# Restrict the matrix, on both axes, to the labels of the retained gene nodes.
filtered_coex = coexpression.loc[human_genes_df.label, human_genes_df.label]

In [17]:
# Gene -> gene edges: link every retained gene to its 10 most co-expressed
# genes that also have a node id. This cell creates the edge file and writes
# the header; later cells append to it.
# NOTE(review): files are written under "out/SAB/" here, while the node files
# are read from "out/sab/" and compress() also reads "out/sab/"; those only
# coincide on case-insensitive file systems.
relation = "ARCHS4_coexpressed_genes"
# newline="" is what the csv module expects, otherwise rows are double-spaced on Windows.
with open("out/SAB/ARCHS4.edges.csv", "w", newline="") as o:
    csv_writer = csv.writer(o)
    csv_writer.writerow(["", "source", "target", "relation", "SAB", "evidence_class", "hidden"])
    # `index` advances once per source gene, so all edges of one gene share it.
    index = 0
    for uid, row in tqdm(human_genes_df.iterrows(), total=len(human_genes_df)):
        gene = row['label']
        # Drop the gene's own entry by label rather than by position: slicing
        # off the first sorted value silently discards the best real partner
        # whenever the self-correlation is not the top-ranked entry.
        ranked = coexpression[gene].drop(labels=gene, errors="ignore").sort_values(ascending=False)
        count = 0
        for coex, val in ranked.items():
            if coex in gene_id:
                cid = gene_id[coex]
                csv_writer.writerow([index, uid, cid, relation, 'ARCHS4', val, "true"])
                count += 1
            if count == 10:
                break
        index += 1




20997it [00:49, 428.48it/s]


In [18]:
# Export the retained gene nodes alongside the edge file.
human_genes_df.to_csv("out/SAB/ARCHS4.Gene.nodes.csv")

In [19]:
import zlib
import zipfile

def compress(file_names, zip_name, path="out/sab/", out_dir="out/compressed"):
    """Bundle files from ``path`` into ``<out_dir>/<zip_name>.zip`` using DEFLATE.

    Args:
        file_names: File names relative to ``path``; each one is stored in the
            archive under that same relative name.
        zip_name: Archive name without the ``.zip`` extension.
        path: Directory the input files are read from.
        out_dir: Directory the archive is written to; created when missing.

    A file that does not exist is reported by name and skipped, so the
    remaining files are still archived.
    """
    # NOTE(review): the notebook writes the ARCHS4 CSVs under "out/SAB/" while
    # the default here is "out/sab/"; these only match on case-insensitive
    # file systems.
    print("File Paths:")
    print(file_names)

    os.makedirs(out_dir, exist_ok=True)
    zip_path = os.path.join(out_dir, "%s.zip" % zip_name)

    # The context manager closes the archive even if writing fails part-way.
    with zipfile.ZipFile(zip_path, mode="w") as zf:
        for file_name in file_names:
            source = os.path.join(path, file_name)
            try:
                # first argument: file on disk, second: name inside the archive
                zf.write(source, file_name, compress_type=zipfile.ZIP_DEFLATED)
            except FileNotFoundError:
                print("An error occurred: %s not found" % source)

In [58]:
# NOTE(review): this archives the files before the predicted-link cells below append
# to ARCHS4.edges.csv; the same call is repeated at the end of the notebook.
compress(["ARCHS4.Gene.nodes.csv", "ARCHS4.edges.csv"], "ARCHS4")

File Paths:
['ARCHS4.Gene.nodes.csv', 'ARCHS4.edges.csv']


## Predicted Links

In [20]:
def fetch_and_save_library(library, file):
    '''Return the lines of an Enrichr gene-set library, downloading it on first use.

    The library is requested from Enrichr's ``geneSetLibrary`` endpoint in
    text mode and cached at ``file``. When ``file`` already exists it is read
    from disk and no request is made.

    Args:
        library: Enrichr library name, inserted into the ``libraryName`` query parameter.
        file: Path of the local cache file; parent directories are created as needed.

    Returns:
        The stripped file content split on newlines (one string per line).

    Raises:
        requests.HTTPError: if the download returns an error status. Nothing
            is written to ``file`` in that case.
    '''
    if not os.path.exists(file):
        parent = os.path.dirname(file)
        if parent:
            os.makedirs(parent, exist_ok=True)
        gmt_url = "https://maayanlab.cloud/Enrichr/geneSetLibrary?mode=text&libraryName=%s" % library
        res = requests.get(gmt_url, timeout=60)
        # Fail before writing so an error page is never cached as if it were the library.
        res.raise_for_status()
        with open(file, 'w') as o:
            o.write(res.text)

    with open(file) as o:
        return o.read().strip().split("\n")

In [21]:
# LINCS L1000 chemical-perturbation consensus signatures, cached under gmt/.
library = "LINCS_L1000_Chem_Pert_Consensus_Sigs"
gmt = fetch_and_save_library(library, "gmt/%s"%library)

In [22]:
# Split the library into Up and Down gene lists keyed by the term name with
# its direction suffix removed. Any term without " Up" counts as Down.
up_gene_sets = {}
down_gene_sets = {}
for line in gmt:
    annot, _ , *genes = line.split("\t")
    direction = "Up" if " Up" in annot else "Down"
    label = annot.replace(" %s" % direction, "")
    bucket = up_gene_sets if direction == "Up" else down_gene_sets
    bucket[label] = genes
len(up_gene_sets), len(down_gene_sets)

(5425, 5425)

In [23]:
# LINCS compound nodes, indexed by node id (first CSV column).
nodes_df = pd.read_csv('out/sab/LINCS.Compound.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,PUBCHEM
83c01dbd-ecc3-593a-8b51-5b6d2ac2b4d9,Lucitanib,Compound,PUBCHEM:25031915
f1bfe06d-6336-5fb6-a0a3-0e6f8b72463f,vismodegib,Compound,PUBCHEM:24776445
a817b1e4-32c7-556c-9eb4-286b40acd9af,indirubin,Compound,PUBCHEM:5318433
0dd385a8-9364-5554-b32d-d79b7573f9b7,"(+)-4-((alpha R)-((2S,5R)-4-allyl-2,5-dimethyl...",Compound,PUBCHEM:123924
09e0a3d6-d9f5-5118-8929-a46472085648,actinomycin-d,Compound,PUBCHEM:2019


In [24]:
# Case-insensitive lookup from node label to node id; when several nodes
# share a lowercased label, the first one encountered is kept.
node_id_mapper = {}
for node_id, node_label in nodes_df.label.items():
    node_id_mapper.setdefault(node_label.lower(), node_id)
len(node_id_mapper), nodes_df.shape

(4518, (4523, 3))

In [25]:
# Count how many Up-signature names resolve to a node label (case-insensitive).
gene_sets = up_gene_sets
lowered = {name.lower() for name in gene_sets}
matched = {name for name in lowered if name in node_id_mapper}
missing = lowered - matched
len(matched), len(missing)



(3341, 2084)

In [26]:
# Compound -> gene predicted edges from the LINCS signatures: for every
# signature whose name matches a compound node, rank all genes by their mean
# co-expression with the signature's genes and keep the top 10.
# The header row already exists (written by the gene-gene cell), so this only appends.
# newline="" is what the csv module expects, otherwise rows are double-spaced on Windows.
with open("out/SAB/ARCHS4.edges.csv", "a", newline="") as o:
    csv_writer = csv.writer(o)
    # `index` advances once per matched signature and runs on from Up into Down.
    index = 0
    signature_sets = (
        ("ARCHS4_coexpressed_genes_LINCS_Up", up_gene_sets),
        ("ARCHS4_coexpressed_genes_LINCS_Down", down_gene_sets),
    )
    for relation, direction_sets in signature_sets:
        for label, genes in direction_sets.items():
            l = label.lower()
            if l not in node_id_mapper:
                continue
            # Resolve the source node for BOTH directions. Previously the Down
            # loop never set `uid`, so every Down edge reused the last Up compound.
            uid = node_id_mapper[l]
            matched_genes = list(set(genes).intersection(filtered_coex.index))
            # With no usable genes the mean is all-NaN; skip rather than write NaN edges.
            if matched_genes:
                mean_coex = filtered_coex[matched_genes].mean(axis=1).sort_values(ascending=False)
                count = 0
                # NOTE(review): members of the signature itself are not excluded
                # from the ranking. The former `coex != gene` test compared
                # against a variable left over from the gene-gene cell.
                for coex, val in mean_coex.items():
                    if coex in gene_id:
                        cid = gene_id[coex]
                        csv_writer.writerow([index, uid, cid, relation, 'ARCHS4', val, "true"])
                        count += 1
                    if count == 10:
                        break
            index += 1

## Metabolomics Workbench (MW)

In [27]:
# Metabolomics Workbench metabolite gene sets, cached under gmt/.
library = "Metabolomics_Workbench_Metabolites_2022"
gmt = fetch_and_save_library(library, "gmt/%s"%library)

In [28]:
# Metabolomics Workbench metabolite nodes, indexed by node id (first CSV column).
nodes_df = pd.read_csv('out/sab/MW.Metabolite.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,PUBCHEM,HMDB,PUBMED
1482623c-3863-5b24-98f3-8f2bc7ba2d0d,trans-Zeatin,Metabolite,PUBCHEM:449093,,
81320127-4c07-5fdd-b96c-ee3160837e79,urea,Metabolite,PUBCHEM:1176,,
28a2d30c-d88c-575c-894f-e50ad71a17a9,Phe-Thr,Metabolite,PUBCHEM:145453498,,
ae0c3a8a-dc2b-5e88-aba3-22bbb4d6d40e,Dynorphin A,Metabolite,PUBCHEM:53481554,,
55f2e909-bf87-57d9-bce2-e734969adc66,clorazepate,Metabolite,PUBCHEM:2809,,


In [29]:
# One gene list per library term. Each line is tab-separated:
# term, a second field that is ignored, then the genes.
# (A leftover `direction` line was removed: it read `annot` from the LINCS
# cell and its result was never used.)
gene_sets = {}
for line in gmt:
    label, _ , *genes = line.split("\t")
    gene_sets[label] = genes
len(gene_sets)

233

In [30]:
# Case-insensitive lookup from node label to node id; when several nodes
# share a lowercased label, the first one encountered is kept.
node_id_mapper = {}
for node_id, node_label in nodes_df.label.items():
    node_id_mapper.setdefault(node_label.lower(), node_id)
print(len(node_id_mapper), nodes_df.shape)

# Count how many library terms resolve to a node label (case-insensitive).
lowered = {name.lower() for name in gene_sets}
matched = {name for name in lowered if name in node_id_mapper}
missing = lowered - matched
len(matched), len(missing)



8912 (9627, 5)


(156, 77)

In [31]:
# Metabolite -> gene predicted edges: for every library term that matches a
# node label, rank all genes by their mean co-expression with the term's
# genes and keep the top 10.
# The header row already exists (written by the gene-gene cell), so this only appends.
# newline="" is what the csv module expects, otherwise rows are double-spaced on Windows.
with open("out/SAB/ARCHS4.edges.csv", "a", newline="") as o:
    csv_writer = csv.writer(o)
    # `index` advances once per matched term.
    index = 0
    relation = "ARCHS4_coexpressed_genes_MW"
    for label, genes in gene_sets.items():
        l = label.lower()
        if l not in node_id_mapper:
            continue
        uid = node_id_mapper[l]
        matched_genes = list(set(genes).intersection(filtered_coex.index))
        # With no usable genes the mean is all-NaN; skip rather than write NaN edges.
        if matched_genes:
            mean_coex = filtered_coex[matched_genes].mean(axis=1).sort_values(ascending=False)
            count = 0
            # NOTE(review): members of the gene set itself are not excluded from
            # the ranking. The former `coex != gene` test compared against a
            # variable left over from the gene-gene cell.
            for coex, val in mean_coex.items():
                if coex in gene_id:
                    cid = gene_id[coex]
                    csv_writer.writerow([index, uid, cid, relation, 'ARCHS4', val, "true"])
                    count += 1
                if count == 10:
                    break
        index += 1

## GTEx
GTEx signatures come in multiple contexts (tissue, sex, age group, direction). Should each context get its own edges, or should we build consensus signatures?

In [54]:
# GTEx tissue signatures (V8), cached under gmt/.
library = "GTEx_Tissues_V8_2023"
gmt = fetch_and_save_library(library, "gmt/%s"%library)

In [70]:
import re

In [72]:
# Open question: terms differ by context — should consensus signatures be built instead?
# Gene lists are keyed by the term name with its direction suffix removed.
# NOTE(review): if the library holds both an Up and a Down term for the same
# context, they map to the same key and the later line overwrites the earlier.
gene_sets = {}
# Directions encountered while parsing (only collected, not used in this cell).
dirs = set()
# Regex for "<tissue> <sex> <age range> <direction>"; it is not applied in this loop.
pattern = "(?P<tissue>.*) (?P<sex>.*) (?P<age>[0-9]*-[0-9]*) (?P<direction>.*)"
for line in gmt:
	annot, _ , *genes = line.split("\t")
	# Any term without " Up" is treated as Down.
	direction = "Up" if " Up" in annot else "Down"
	dirs.add(direction)
	label = annot.replace(" %s"%direction, "")
	gene_sets[label] = genes
len(gene_sets)

511

In [75]:
# Try the context regex on one sample term name to see how it splits.
annot = "Bladder Bo Female 40-49 Up"
re.match(pattern, annot).groupdict()

{'tissue': 'Bladder Bo', 'sex': 'Female', 'age': '40-49', 'direction': 'Up'}

In [55]:
# GTEXEQTL anatomy nodes, indexed by node id (first CSV column).
nodes_df = pd.read_csv('out/sab/GTEXEQTL.Anatomy.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,LNC,UBERON,FMA,NCI,CHV,UWDA,MSH,EFO
16a311cd-fb0f-5163-8979-3695d3683a4c,Subcutaneous Fat,Anatomy,,UBERON:2190,FMA:74315,C33645,CHV:21586,UWDA:78251,D050151,
714f924a-f3b5-51e1-a323-2a1b70dbf7d2,lymphoblast,Anatomy,MTHU015669,,FMA:83030,C13013,CHV:22371,,,
e186ef2e-b773-502f-87c6-d6f4606dbce1,Thyroid Gland,Anatomy,LP7636-6,UBERON:2046,FMA:9603,C160209,CHV:12262,UWDA:9603,D013961,
724b2390-7f15-5de7-ab1c-094b1f2c88cb,Myocardium of left ventricle,Anatomy,,UBERON:6566,FMA:9558,,,UWDA:9558,,
5f9a4728-3943-58de-842a-dada7e9cced4,lower leg skin,Anatomy,,UBERON:4264,,,,,,


In [57]:
# Case-insensitive lookup from node label to node id; when several nodes
# share a lowercased label, the first one encountered is kept.
node_id_mapper = {}
for node_id, node_label in nodes_df.label.items():
    node_id_mapper.setdefault(node_label.lower(), node_id)
print(len(node_id_mapper), nodes_df.shape)

48 (48, 10)


In [58]:
# GTEXEXP anatomy nodes, indexed by node id (first CSV column).
nodes_df = pd.read_csv('out/sab/GTEXEXP.Anatomy.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,LNC,UBERON,FMA,NCI,CHV,UWDA,MSH,EFO
16a311cd-fb0f-5163-8979-3695d3683a4c,Subcutaneous Fat,Anatomy,,UBERON:2190,FMA:74315,C33645,CHV:21586,UWDA:78251,D050151,
714f924a-f3b5-51e1-a323-2a1b70dbf7d2,lymphoblast,Anatomy,MTHU015669,,FMA:83030,C13013,CHV:22371,,,
e186ef2e-b773-502f-87c6-d6f4606dbce1,Thyroid Gland,Anatomy,LP7636-6,UBERON:2046,FMA:9603,C160209,CHV:12262,UWDA:9603,D013961,
724b2390-7f15-5de7-ab1c-094b1f2c88cb,Myocardium of left ventricle,Anatomy,,UBERON:6566,FMA:9558,,,UWDA:9558,,
7dfd85af-bcd2-5186-9915-4bd01eee3537,Ascending aorta structure,Anatomy,LP7010-4,UBERON:1496,FMA:3736,C32150,CHV:1499,UWDA:3736,D001013,


In [59]:
# Extend the existing lookup with the labels of the frame now in nodes_df;
# labels that are already mapped keep their earlier node id.
for node_id, node_label in nodes_df.label.items():
    node_id_mapper.setdefault(node_label.lower(), node_id)
print(len(node_id_mapper), nodes_df.shape)

53 (44, 10)


In [None]:
# Count how many gene-set names resolve to a node label (case-insensitive).
lowered = {name.lower() for name in gene_sets}
matched = {name for name in lowered if name in node_id_mapper}
missing = lowered - matched
len(matched), len(missing)



## IDG

In [32]:
# IDG drug-target gene sets, cached under gmt/.
library = "IDG_Drug_Targets_2022"
gmt = fetch_and_save_library(library, "gmt/%s"%library)

In [33]:
# One gene list per library term. Each line is tab-separated:
# term, a second field that is ignored, then the genes.
# (Removed an unused `dirs` set and a comment carried over from the GTEx cell.)
gene_sets = {}
for line in gmt:
    label, _ , *genes = line.split("\t")
    gene_sets[label] = genes
len(gene_sets)

888

In [34]:
# IDGP compound nodes, indexed by node id (first CSV column).
nodes_df = pd.read_csv('out/sab/IDGP.Compound.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,PUBCHEM
a4b7096e-14a6-5b32-918d-e3bbfbce8549,(R)-N-(1-(1-(4-(3-cyano-1H-indazol-1-yl)-2-flu...,Compound,PUBCHEM:56676610
f44958e6-f175-5419-935f-22497387f77d,1-(2-Benzyl-allyl)-4-[2-[bis(4-fluorophenyl)me...,Compound,PUBCHEM:24825331
90ee6480-4a38-5997-b682-dc6b62f0094e,(S)-2-Acetylamino-4-methyl-pentanoic acid ((S)...,Compound,PUBCHEM:18458068
938524b6-fadb-5a3a-8587-fc6417860903,3-(2'-((2-chloro-2'-methyl-4'-(trifluoromethyl...,Compound,PUBCHEM:122190362
7f325250-92f8-519d-a51d-59ae53f2778f,"US9409907, 183",Compound,PUBCHEM:73386889


In [35]:
# Case-insensitive lookup from node label to node id; when several nodes
# share a lowercased label, the first one encountered is kept.
node_id_mapper = {}
for node_id, node_label in nodes_df.label.items():
    node_id_mapper.setdefault(node_label.lower(), node_id)
print(len(node_id_mapper), nodes_df.shape)



322568 (324293, 3)


In [36]:
# NOTE(review): this reloads the same IDGP.Compound file that was read two cells earlier,
# so merging it into the lookup adds no new labels — was a different IDG node file intended?
nodes_df = pd.read_csv('out/sab/IDGP.Compound.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,PUBCHEM
a4b7096e-14a6-5b32-918d-e3bbfbce8549,(R)-N-(1-(1-(4-(3-cyano-1H-indazol-1-yl)-2-flu...,Compound,PUBCHEM:56676610
f44958e6-f175-5419-935f-22497387f77d,1-(2-Benzyl-allyl)-4-[2-[bis(4-fluorophenyl)me...,Compound,PUBCHEM:24825331
90ee6480-4a38-5997-b682-dc6b62f0094e,(S)-2-Acetylamino-4-methyl-pentanoic acid ((S)...,Compound,PUBCHEM:18458068
938524b6-fadb-5a3a-8587-fc6417860903,3-(2'-((2-chloro-2'-methyl-4'-(trifluoromethyl...,Compound,PUBCHEM:122190362
7f325250-92f8-519d-a51d-59ae53f2778f,"US9409907, 183",Compound,PUBCHEM:73386889


In [37]:
# Extend the existing lookup with the labels of the frame now in nodes_df;
# labels that are already mapped keep their earlier node id.
for node_id, node_label in nodes_df.label.items():
    node_id_mapper.setdefault(node_label.lower(), node_id)
print(len(node_id_mapper))



322568


In [38]:
# Count how many library terms resolve to a node label (case-insensitive).
lowered = {name.lower() for name in gene_sets}
matched = {name for name in lowered if name in node_id_mapper}
missing = lowered - matched
len(matched), len(missing)



(444, 444)

In [39]:
# Compound -> gene predicted edges for IDG: for every library term that
# matches a node label, rank all genes by their mean co-expression with the
# term's genes and keep the top 10.
# The header row already exists (written by the gene-gene cell), so this only appends.
# newline="" is what the csv module expects, otherwise rows are double-spaced on Windows.
with open("out/SAB/ARCHS4.edges.csv", "a", newline="") as o:
    csv_writer = csv.writer(o)
    # `index` advances once per matched term.
    index = 0
    relation = "ARCHS4_coexpressed_genes_IDG"
    for label, genes in gene_sets.items():
        l = label.lower()
        if l not in node_id_mapper:
            continue
        uid = node_id_mapper[l]
        matched_genes = list(set(genes).intersection(filtered_coex.index))
        # With no usable genes the mean is all-NaN; skip rather than write NaN edges.
        if matched_genes:
            mean_coex = filtered_coex[matched_genes].mean(axis=1).sort_values(ascending=False)
            count = 0
            # NOTE(review): members of the gene set itself are not excluded from
            # the ranking. The former `coex != gene` test compared against a
            # variable left over from the gene-gene cell.
            for coex, val in mean_coex.items():
                if coex in gene_id:
                    cid = gene_id[coex]
                    csv_writer.writerow([index, uid, cid, relation, 'ARCHS4', val, "true"])
                    count += 1
                if count == 10:
                    break
        index += 1

## HuBMAP

In [40]:
# HuBMAP ASCT+B (augmented) gene sets, cached under gmt/.
library = "HuBMAP_ASCTplusB_augmented_2022"
gmt = fetch_and_save_library(library, "gmt/%s"%library)

In [41]:
# One gene list per library term. Each line is tab-separated:
# term, a second field that is ignored, then the genes.
# (Removed an unused `dirs` set.) The loop is kept as a plain loop because a
# later cell displays the `label` it leaves behind.
gene_sets = {}
for line in gmt:
    label, _ , *genes = line.split("\t")
    gene_sets[label] = genes
len(gene_sets)

777

In [42]:
# Inspect the last term name left in `label` by the parsing loop.
label

'Hillock Cell Of Prostatic urethra - Prostate Gland'

In [43]:
# Inspect the term names to see how they are formatted.
gene_sets.keys()

dict_keys(['Paneth Cell Of Epithelium Proper Of Small intestine - Small Intestine', 'Duodenal Goblet cell - Small Intestine', 'Tuft cell - Small Intestine', 'absorptive - Small Intestine', 'enterocyte - Small Intestine', 'Endocrine Lineage cell - Small Intestine', 'Stem cell - Small Intestine', 'endothelial - Small Intestine', 'myofibroblast - Small Intestine', 'fibroblast - Small Intestine', 'B cell - Small Intestine', 'plasma - Small Intestine', 'CD4+ T - Small Intestine', 'Regulatory CD4+ T - Small Intestine', 'CD8+ T - Small Intestine', 'macrophage - Small Intestine', 'Mast cell - Small Intestine', 'fibroblasts - Small Intestine', 'Endothelial cells - Small Intestine', 'Epithelial Stem cells - Small Intestine', 'Paneth - Small Intestine', 'endocrine - Small Intestine', 'Jejunal Goblet cell - Small Intestine', 'enterocytes - Small Intestine', 'Tuft cells - Small Intestine', 'myofibroblasts - Small Intestine', 'macrophages - Small Intestine', 'Mast cells - Small Intestine', 'endoethl

In [44]:
# HMAZ anatomy nodes, indexed by node id (first CSV column).
nodes_df = pd.read_csv('out/sab/HMAZ.Anatomy.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,AZ,CL
0a62794d-65e7-5865-8884-873c47b601e9,Capillary Endothelial Cells,Anatomy,AZ:5,CL:2144
cdf14e95-744f-561a-877b-bd8b74822d73,innate lymphoid cell,Anatomy,AZ:9,CL:1065
e61fa17d-817a-541e-af7f-9ec8869a7972,Kidney_L3_T cell,Anatomy,AZ:69,
5cf92c83-33a9-5faf-b957-6e9880f6bb0a,alternatively activated macrophage,Anatomy,AZ:46,CL:890
0fa76653-3d12-551b-9cd9-f6aed515817d,macula densa epithelial cell,Anatomy,AZ:47,CL:1000850


In [45]:
# Case-insensitive lookup from node label to node id; when several nodes
# share a lowercased label, the first one encountered is kept.
node_id_mapper = {}
for node_id, node_label in nodes_df.label.items():
    node_id_mapper.setdefault(node_label.lower(), node_id)
print(len(node_id_mapper), nodes_df.shape)



92 (92, 4)


In [46]:
# Count how many term names resolve to a node label, comparing only the part
# before " - " (lowercased).
lowered = {name.lower().split(" - ")[0] for name in gene_sets}
matched = {name for name in lowered if name in node_id_mapper}
missing = lowered - matched
len(matched), len(missing)



(37, 571)

In [47]:
# Anatomy -> gene predicted edges for HuBMAP: for every library term whose
# name before " - " matches a node label, rank all genes by their mean
# co-expression with the term's genes and keep the top 10.
# NOTE(review): terms that differ only after " - " resolve to the same node,
# so one node can receive several blocks of edges — confirm this is intended.
# The header row already exists (written by the gene-gene cell), so this only appends.
# newline="" is what the csv module expects, otherwise rows are double-spaced on Windows.
with open("out/SAB/ARCHS4.edges.csv", "a", newline="") as o:
    csv_writer = csv.writer(o)
    # `index` advances once per matched term.
    index = 0
    relation = "ARCHS4_coexpressed_genes_HuBMAP"
    for label, genes in gene_sets.items():
        l = label.lower().split(" - ")[0]
        if l not in node_id_mapper:
            continue
        uid = node_id_mapper[l]
        matched_genes = list(set(genes).intersection(filtered_coex.index))
        # With no usable genes the mean is all-NaN; skip rather than write NaN edges.
        if matched_genes:
            mean_coex = filtered_coex[matched_genes].mean(axis=1).sort_values(ascending=False)
            count = 0
            # NOTE(review): members of the gene set itself are not excluded from
            # the ranking. The former `coex != gene` test compared against a
            # variable left over from the gene-gene cell.
            for coex, val in mean_coex.items():
                if coex in gene_id:
                    cid = gene_id[coex]
                    csv_writer.writerow([index, uid, cid, relation, 'ARCHS4', val, "true"])
                    count += 1
                if count == 10:
                    break
        index += 1

## GlyGen

In [48]:
# GlyGen glycosylated-protein gene sets, cached under gmt/.
library = "GlyGen_Glycosylated_Proteins_2022"
gmt = fetch_and_save_library(library, "gmt/%s"%library)

In [49]:
# One gene list per library term. Each line is tab-separated:
# term, a second field that is ignored, then the genes.
# (Removed an unused `dirs` set.)
gene_sets = {}
for line in gmt:
    label, _ , *genes = line.split("\t")
    gene_sets[label] = genes
len(gene_sets)

338

In [50]:
# GLYCANS Glytoucan nodes, indexed by node id (first CSV column).
nodes_df = pd.read_csv('out/sab/GLYCANS.Glytoucan.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,GLYTOUCAN
7c256cfc-61b7-5568-b997-eea76e864673,543.22,Glytoucan,G99760DU
d670b459-3855-567a-a8bb-22a923da9863,2296.82,Glytoucan,G29633ZY
0086247d-4539-5920-ae9a-259b5937d101,1031.35,Glytoucan,G51580UF
e52f9968-3f99-58d6-8403-28248acc3e03,1772.67,Glytoucan,G75594VJ
bd812aa8-eb47-583d-9e11-d2f496e674fd,383.14,Glytoucan,G67600YJ


In [51]:
# Case-insensitive lookup from the GLYTOUCAN column value to node id (the
# label column is not used here); the first node seen for a value is kept.
node_id_mapper = {}
for node_id, accession in nodes_df.GLYTOUCAN.items():
    node_id_mapper.setdefault(accession.lower(), node_id)
print(len(node_id_mapper), nodes_df.shape)



33755 (33755, 3)


In [52]:
# PROTEOFORM Glytoucan nodes, indexed by node id (first CSV column).
nodes_df = pd.read_csv('out/sab/PROTEOFORM.Glytoucan.nodes.csv', index_col=0)
nodes_df.head()

Unnamed: 0,label,type,GLYTOUCAN
deb8bfae-3697-5d85-90d3-9235a790e064,label:2749,Glytoucan,G61806WR
4a7e16fe-f74f-50c3-9606-94f68329f671,label:2151,Glytoucan,G78644BR
e149e361-6dbc-54a9-95cf-9fc6063fd933,label:2192,Glytoucan,G40124HY
f08e54f9-956b-5744-b79a-218ab3462f84,label:2224,Glytoucan,G83229XP
06936788-a612-5831-824a-e0e9acd8a7b4,label:2256,Glytoucan,G31615DN


In [53]:
# Extend the existing lookup with the GLYTOUCAN values of the frame now in
# nodes_df; values that are already mapped keep their earlier node id.
for node_id, accession in nodes_df.GLYTOUCAN.items():
    node_id_mapper.setdefault(accession.lower(), node_id)
print(len(node_id_mapper), nodes_df.shape)



33755 (1554, 3)


In [54]:
# Count how many term names resolve to a mapped key, comparing only the part
# before " - " (lowercased).
lowered = {name.lower().split(" - ")[0] for name in gene_sets}
matched = {name for name in lowered if name in node_id_mapper}
missing = lowered - matched
len(matched), len(missing)



(338, 0)

In [55]:
# Glycan -> gene predicted edges for GlyGen: for every library term that
# matches a mapped key, rank all genes by their mean co-expression with the
# term's genes and keep the top 10.
# The header row already exists (written by the gene-gene cell), so this only appends.
# newline="" is what the csv module expects, otherwise rows are double-spaced on Windows.
with open("out/SAB/ARCHS4.edges.csv", "a", newline="") as o:
    csv_writer = csv.writer(o)
    # `index` advances once per matched term.
    index = 0
    relation = "ARCHS4_coexpressed_genes_Glygen"
    for label, genes in gene_sets.items():
        l = label.lower()
        if l not in node_id_mapper:
            continue
        uid = node_id_mapper[l]
        matched_genes = list(set(genes).intersection(filtered_coex.index))
        # With no usable genes the mean is all-NaN; skip rather than write NaN edges.
        if matched_genes:
            mean_coex = filtered_coex[matched_genes].mean(axis=1).sort_values(ascending=False)
            count = 0
            # NOTE(review): members of the gene set itself are not excluded from
            # the ranking. The former `coex != gene` test compared against a
            # variable left over from the gene-gene cell.
            for coex, val in mean_coex.items():
                if coex in gene_id:
                    cid = gene_id[coex]
                    csv_writer.writerow([index, uid, cid, relation, 'ARCHS4', val, "true"])
                    count += 1
                if count == 10:
                    break
        index += 1

In [56]:
# Archive the node file together with the edge file now that all predicted edges have been appended.
compress(["ARCHS4.Gene.nodes.csv", "ARCHS4.edges.csv"], "ARCHS4")

File Paths:
['ARCHS4.Gene.nodes.csv', 'ARCHS4.edges.csv']
