# Generating Evaluation Sets

Untreated cell-line profiles from L1000 (962 dimensions) and ARCHS4 (35,238 dimensions).

In [113]:
import pandas as pd
import h5py
import random
import numpy as np
import numpy_indexed as npi
import time
import requests
import xmltodict
import json

In [2]:
# Input/output locations. All paths are relative, so they resolve against the
# kernel's current working directory.

# ARCHS4 HDF5 expression matrix (opened read-only further down) and a path for a
# filtered sample list (defined here; not referenced by the cells in this notebook).
ARCHS4_filename = "../data/ARCHS4/human_matrix_v9.h5"
ARCHS4_filtered_sample_output_filename = "../data/processed/ARCHS4/filtered_sample_list.txt"

# L1000 gene lists; only the landmark list is read in this notebook.
l1000_all_gene_list = "../data/L1000/all_gene_list.txt"
l1000_landmark_gene_list = "../data/L1000/landmark_gene_list.txt"

# ARCHS4 gene list, read as one entry per line.
archs4_all_gene_list = "../data/ARCHS4/all_gene_list.txt"

# GTEx counterparts of the L1000 gene lists; only the landmark list is read here.
gtex_l1000_all_gene_list = "../data/GTEx/l1000_all_gene_list.txt"
gtex_l1000_landmark_gene_list = "../data/GTEx/l1000_landmark_gene_list.txt"

# GTEx RNA-seq gene list, read as one entry per line.
gtex_rnaseq_all_gene_list = "../data/GTEx/rnaseq_all_gene_list.txt"

In [12]:
# Cell lines to process; each name is substituted into the MetaSRA template below.
cell_lines = ["MCF7", "PC3", "A375", "HEPG2", "VCAP"]
# Filename template, filled via .format(<"runs" or "samples">, <cell line>) when the CSVs are read.
metasra_folder = "../data/Evaluation/MetaSRA/metaSRA-{}-{}.csv"

Output filenames

In [249]:
# Pickle file that receives the {cell line: [GEO IDs]} dictionary at the end of the notebook.
geo_id_dict_filename = "../data/Evaluation/MetaSRA/geo_id_dict.pkl"

# Get overlapping landmark genes

In [3]:
def _read_gene_list(path):
    """Return the whitespace-stripped lines of the text file at `path`, in file order."""
    with open(path, "r") as handle:
        return [line.strip() for line in handle]


# One gene identifier per line in each of the four list files.
l1000_landmark_gene = _read_gene_list(l1000_landmark_gene_list)
archs4_all_gene = _read_gene_list(archs4_all_gene_list)
gtex_l1000_landmark_gene = _read_gene_list(gtex_l1000_landmark_gene_list)
gtex_rnaseq_all_gene = _read_gene_list(gtex_rnaseq_all_gene_list)

In [4]:
# Landmark genes present in all four lists (L1000 landmark, ARCHS4, GTEx L1000
# landmark, GTEx RNA-seq).
# sorted() instead of list(): iteration order of a set of strings changes between
# kernel sessions (string hash randomisation), so sorting is what makes the gene
# order reproducible from run to run.
overlap_landmark_genes = sorted(
    set(l1000_landmark_gene).intersection(
        archs4_all_gene,
        gtex_l1000_landmark_gene,
        gtex_rnaseq_all_gene,
    )
)
overlap_rnaseq_genes = sorted(set(archs4_all_gene).intersection(gtex_rnaseq_all_gene)) # common genes in ARCHS4 and GTEx RNA-seq

In [7]:
# Open the ARCHS4 HDF5 matrix read-only and pull out the pieces used later.
# The file handle is deliberately left open: `expression` is a dataset handle
# into the file, not an in-memory copy.
print('Processing RNA-seq data.....')
h5 = h5py.File(ARCHS4_filename, mode='r')
data_file = h5['data']
expression = data_file['expression']
meta_group = h5['meta']
genes = list(meta_group['genes']['genes'])
sample_geo_list = [accession for accession in meta_group['samples']['geo_accession']]

Processing RNA-seq data.....


# Read MetaSRA

In [61]:
def parse(input_list):
    """Collect ``key: value`` pairs from ";"-separated metadata strings.

    Each element of ``input_list`` is split on ";" into fields, and every field
    containing ": " contributes one entry. The field is split at the FIRST
    ": " only, so a value that itself contains ": " is kept whole instead of
    being truncated.

    Parameters
    ----------
    input_list : iterable of str
        Strings such as ``"cell line: MCF7; treatment: DMSO"``.

    Returns
    -------
    dict
        Mapping of key to value. When a key occurs more than once, the last
        occurrence wins. Elements that contain no ";" are ignored, as are
        fields without ": ".
    """
    all_unique_value = dict()
    for element in input_list:
        fields = [x.strip() for x in element.split(";")]
        # Elements with no ";" at all are skipped (behaviour kept from the original).
        if len(fields) <= 1:
            continue
        for field in fields:
            key, separator, value = field.partition(": ")
            if separator:
                all_unique_value[key] = value
    return all_unique_value

# SRA ID to GEO ID

In [244]:
def get_geo_id(sra_id):
    """Look up the GEO accession recorded for an SRA sample via NCBI E-utilities.

    Each attempt makes two requests: ``esearch`` (db=sra) to turn the accession
    into a record id, then ``efetch`` for that record, whose body is parsed
    with ``xmltodict``. Up to three attempts are made, with a random pause of
    less than one second between them. On success the pair
    ``sra_id, geo_id`` is printed.

    Parameters
    ----------
    sra_id : str
        SRA sample accession. Only strings starting with "SRS" are queried.

    Returns
    -------
    str
        The value found under EXPERIMENT_ATTRIBUTES / EXPERIMENT_ATTRIBUTE /
        VALUE of the fetched record, or "" when ``sra_id`` is not an "SRS"
        string or every attempt failed.
    """
    # Non-string inputs (e.g. NaN coming out of a DataFrame column) are treated
    # like any other non-SRS value instead of crashing on .startswith.
    if not isinstance(sra_id, str) or not sra_id.startswith("SRS"):
        return ""

    max_trials = 3
    request_timeout = 30  # seconds; without it a stalled connection blocks forever

    for trial in range(1, max_trials + 1):
        try:
            # get id
            sra_id_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term={sra_id}&retmode=json"
            # Decode the body as JSON. The previous eval() executed whatever
            # text the server returned and failed on JSON true/false/null.
            result = requests.post(sra_id_url, timeout=request_timeout).json()
            record_id = result["esearchresult"]["idlist"][0]

            # get geo id
            # NOTE(review): the URL asks for retmode=json but the body is parsed
            # as XML; kept exactly as before -- confirm against the efetch docs.
            sra_datafetch_url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=sra&id={record_id}&retmode=json"
            obj = xmltodict.parse(requests.post(sra_datafetch_url, timeout=request_timeout).text)
            # Alternative location previously noted: ["DESIGN"]["SAMPLE_DESCRIPTOR"]["@refname"]
            # NOTE(review): assumes EXPERIMENT_ATTRIBUTE is a single mapping; a
            # record with several attributes would fail here and yield "".
            geo_id = obj["EXPERIMENT_PACKAGE_SET"]["EXPERIMENT_PACKAGE"]["EXPERIMENT"]["EXPERIMENT_ATTRIBUTES"]["EXPERIMENT_ATTRIBUTE"]["VALUE"]
        except Exception:
            # Best-effort lookup: network, decoding and missing-key errors all
            # count as a failed attempt. KeyboardInterrupt is no longer
            # swallowed, so a long run can be stopped by the user.
            if trial < max_trials:
                time.sleep(random.random())
            continue

        # Short random pause after a successful lookup to space out requests.
        time.sleep(random.random())
        print(sra_id, geo_id)
        return geo_id

    return ""

In [245]:
# Collect GEO accessions of control-like samples for every cell line.
# Result: {cell line: [get_geo_id(...) result for each selected MetaSRA sample]}.
matched_geo_ids_for_cell_line = dict()
for cell_line in cell_lines:
    # for each cell line, load sample description file from metaSRA
    print(cell_line)
    metasra_runs = pd.read_csv(metasra_folder.format("runs", cell_line)) # sample ID and experiment ID
    metasra_samples = pd.read_csv(metasra_folder.format("samples", cell_line), index_col=0) # sample descriptions

    # Lookup built from the runs table: "sra_study_id" -> "sra_experiment_id".
    # NOTE(review): keyed by study id despite the variable name, and not used
    # anywhere in this cell -- confirm whether it is still needed.
    sample_id_to_exp_id_dict = dict(zip(metasra_runs["sra_study_id"], metasra_runs["sra_experiment_id"]))

    # Keep samples whose raw metadata mentions control / vehicle / DMSO.
    # na=False makes rows with missing metadata count as non-matches; without it
    # the mask contains NaN and the boolean indexing below raises.
    is_control_like = (
        metasra_samples["raw_SRA_metadata"]
        .str.lower()
        .str.contains("control|vehicle|dmso", na=False)
    )
    selected_metasra_samples = metasra_samples[is_control_like]

    geo_ids = [get_geo_id(x) for x in selected_metasra_samples["sample_id"].tolist()]
    matched_geo_ids_for_cell_line[cell_line] = geo_ids

MCF7
SRS4101480 GSM3499305
SRS4101464 GSM3499289
SRS4101473 GSM3499298
SRS4101463 GSM3499288
SRS4101462 GSM3499287
SRS4101474 GSM3499299
SRS4101468 GSM3499293
SRS4101471 GSM3499296
SRS4101472 GSM3499297
SRS4101466 GSM3499291
SRS4101461 GSM3499286
SRS4101470 GSM3499295
SRS4101467 GSM3499292
SRS4101459 GSM3499284
SRS4101469 GSM3499294
SRS4101460 GSM3499285
SRS4101465 GSM3499290
SRS4101478 GSM3499302
SRS4101481 GSM3499306
SRS4101482 GSM3499307
SRS4101479 GSM3499304
SRS4101476 GSM3499301
SRS4101475 GSM3499300
SRS4101477 GSM3499303
SRS4257247 GSM3565534
SRS4257269 GSM3565556
SRS4257056 GSM3565366
SRS4257122 GSM3565409
SRS4257163 GSM3565450
SRS4257374 GSM3565661
SRS579419 GSM1354552
SRS579420 GSM1354553
SRS579430 GSM1354562
SRS579429 GSM1354563
SRS579435 GSM1354568
SRS579436 GSM1354569
SRS884217 GSM1643975
SRS884210 GSM1643981
SRS884212 GSM1643980
SRS884228 GSM1643964
SRS884227 GSM1643965
SRS884218 GSM1643974
SRS2211797 GSM2633721
SRS2211805 GSM2633729
SRS2211785 GSM2633709
SRS2211793 GSM263

In [246]:
# Sanity check: show which cell lines have an entry in the result dictionary.
matched_geo_ids_for_cell_line.keys()

dict_keys(['MCF7', 'PC3', 'A375', 'HEPG2', 'VCAP'])

# Save 

In [247]:
import pickle

In [250]:
# Persist the {cell line: [GEO IDs]} dictionary as a pickle file.
with open(geo_id_dict_filename, mode="wb") as pickle_file:
    pickle.dump(matched_geo_ids_for_cell_line, pickle_file)