In [1]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import csv

In [2]:
# Location of the ARCHS4 correlation table that the rest of the notebook works on.
ARCHS4_CORRELATION_URL = 'https://s3.amazonaws.com/mssm-data/human_correlation_archs4.f'
coexpression = pd.read_feather(ARCHS4_CORRELATION_URL)

In [3]:
# Label the rows with the same names as the columns so the (square) matrix can be
# addressed by gene name on both axes. This mutates `coexpression` in place.
coexpression.index = coexpression.columns

In [4]:
# Sanity check: rows and columns carry the same labels, so the matrix should be square.
coexpression.shape

(26415, 26415)

In [5]:
# Merge every per-source Gene node table into a single dict keyed by node id
# (the first CSV column). Files without an 'HGNC' column are printed and skipped.
genes = {}
# glob() returns paths in arbitrary, filesystem-dependent order. Because a later
# file overrides attributes from an earlier one, sort the paths so the merged
# result is the same on every run and on every machine.
for filename in sorted(glob('out/sab/*Gene.nodes.csv')):
	df = pd.read_csv(filename, index_col=0)
	if 'HGNC' not in df.columns:
		print(filename)
		continue
	for i, row in tqdm(df.iterrows()):
		# Attributes from this file win over same-named attributes seen earlier.
		# NOTE(review): a NaN in this file also overwrites an earlier non-null
		# value for the same node id - confirm that this is intended.
		genes[i] = {**genes.get(i, {}), **row}


15807it [00:00, 43168.48it/s]
13375it [00:00, 41967.92it/s]
3983it [00:00, 39907.39it/s]
4419it [00:00, 39820.88it/s]
34579it [00:00, 35515.94it/s]
677it [00:00, 36933.79it/s]
4547it [00:00, 35751.64it/s]
49987it [00:01, 46975.46it/s]


out/sab/MOTRPAC.Gene.nodes.csv


301it [00:00, 35361.76it/s]
34702it [00:00, 42026.29it/s]
1061it [00:00, 39046.39it/s]


out/sab/NPO.Gene.nodes.csv


8334it [00:00, 49844.19it/s]


In [6]:
# Number of distinct node ids collected across the Gene node files.
len(genes)

57541

In [7]:
# Distinct 'label' values over all collected gene records.
gene_names = {record['label'] for record in genes.values()}

In [8]:
# Number of distinct gene labels.
len(gene_names)

56635

In [9]:
# Labels that also appear as a column of the correlation matrix.
human_genes = gene_names & set(coexpression.columns)

In [10]:
# How many gene labels are also present in the correlation matrix.
len(human_genes)

20998

In [11]:
# One row per node id (the dict keys), one column per merged attribute.
genes_df = pd.DataFrame.from_dict(genes, orient='index')

In [12]:
# Keep only rows whose label is covered by the correlation matrix and that
# carry a non-null HGNC value.
label_in_matrix = genes_df['label'].isin(human_genes)
has_hgnc = genes_df['HGNC'].notna()
human_genes_df = genes_df[label_in_matrix & has_hgnc]

In [13]:
# Rows and columns left after the label / HGNC filter.
human_genes_df.shape

(20997, 8)

In [14]:
# Lookup from gene label to node id (the frame's index); if a label occurs on
# several rows, the last row wins.
gene_id = dict(zip(human_genes_df['label'], human_genes_df.index))

In [15]:
# For every retained gene, write edges to its most strongly co-expressed partner
# genes (at most TOP_N per gene), ranked by the value in the correlation matrix.
relation = "predicted_ARCHS4_coexpression"
TOP_N = 10  # maximum number of partner edges written per source gene
# The file goes to the lowercase "out/sab/" directory, which is where the node
# files are globbed from and where compress() reads its inputs; "out/SAB/" is a
# different directory on case-sensitive filesystems.
# newline="" is what the csv module asks for on files handed to csv.writer.
with open("out/sab/ARCHS4.edges.csv", "w", newline="") as o:
	csv_writer = csv.writer(o)
	csv_writer.writerow(["", "source", "target", "relation", "SAB", "evidence_class", "predicted"])
	index = 0  # running row id (first CSV column), unique per written edge
	for uid, row in tqdm(human_genes_df.iterrows(), total=len(human_genes_df)):
		gene = row['label']
		# Rank all genes by correlation with `gene`. The gene itself is skipped by
		# the explicit check below rather than by dropping the first entry, so a
		# real partner is not lost when self-correlation is not ranked first.
		ranked = coexpression[gene].sort_values(ascending=False)
		count = 0
		for coex, score in ranked.items():
			# Skip the gene itself and partners that have no node id.
			if coex == gene or coex not in gene_id:
				continue
			csv_writer.writerow([index, uid, gene_id[coex], relation, 'ARCHS4', score, "true"])
			index += 1
			count += 1
			if count == TOP_N:
				break




20997it [01:23, 252.53it/s]


In [18]:
# Save the retained gene nodes into the lowercase "out/sab/" directory: that is
# where compress() looks for "ARCHS4.Gene.nodes.csv" and where the other Gene node
# files are read from. "out/SAB/" is a different directory on case-sensitive
# filesystems.
human_genes_df.to_csv("out/sab/ARCHS4.Gene.nodes.csv")

In [19]:
import zlib
import zipfile

def compress(file_names, zip_name, path="out/sab/"):
    """Bundle files from ``path`` into ``out/compressed/<zip_name>.zip``.

    Args:
        file_names: Names of the files to add, relative to ``path``. Each file
            is stored in the archive under that same name.
        zip_name: Archive name without the ``.zip`` extension.
        path: Prefix prepended to every entry of ``file_names``. Defaults to
            ``"out/sab/"``, the directory that used to be hard-coded.

    Archiving stops at the first file that does not exist: the missing path is
    printed and the archive is closed with the files added up to that point.
    A missing ``out/compressed/`` directory still raises, as before.
    """
    print("File Paths:")
    print(file_names)

    archive_path = "out/compressed/%s.zip" % zip_name
    # The context manager closes the archive on every exit path.
    with zipfile.ZipFile(archive_path, mode="w") as zf:
        for file_name in file_names:
            source = path + file_name
            try:
                # ZIP_DEFLATED compresses the entry (ZIP_STORED would only store it).
                zf.write(source, file_name, compress_type=zipfile.ZIP_DEFLATED)
            except FileNotFoundError:
                # Best-effort like the original, but say which file was missing.
                print("An error occurred: file not found: %s" % source)
                break

In [21]:
# Bundle the ARCHS4 node and edge files into out/compressed/ARCHS4.zip.
compress(["ARCHS4.Gene.nodes.csv", "ARCHS4.edges.csv"], "ARCHS4")

File Paths:
['ARCHS4.Gene.nodes.csv', 'ARCHS4.edges.csv']
