In [1]:
#Jill E. Moore
#Moore Lab - UMass Chan Medical School
#May 2022

import os
import json
import urllib
from urllib import request, parse, error
import requests
import base64
from statistics import mean, median

In [2]:
def Process_Token(path="/home/moorej3/.encode.txt"):
    """Read a credential pair from the first line of a tab-separated file.

    Parameters
    ----------
    path : str
        File whose first line holds two tab-separated fields. Defaults to the
        location this notebook has always used.

    Returns
    -------
    tuple of (str, str)
        The first and second tab-separated fields of the first line, with
        trailing whitespace stripped from the line.

    Raises
    ------
    OSError
        If the file cannot be opened.
    StopIteration
        If the file is empty.
    IndexError
        If the first line has fewer than two tab-separated fields.
    """
    # The context manager closes the handle even if parsing fails.
    with open(path) as credentials:
        credArray = next(credentials).rstrip().split("\t")
    return credArray[0], credArray[1]

In [3]:
def _fetch_experiment_json(dataset, creds):
    """Download and parse the JSON record for ``dataset`` from encodeproject.org.

    ``creds`` is sent verbatim after ``Basic `` in the Authorization header.
    """
    url = "https://www.encodeproject.org/"+dataset+"/?format=json"
    portalRequest = urllib.request.Request(url)
    portalRequest.add_header("Authorization", "Basic %s" % creds)
    # Close the HTTP response as soon as the payload has been read.
    with urllib.request.urlopen(portalRequest) as response:
        return json.loads(response.read())


def Extract_Experiment_Metatdata(dataset, creds, cache_dir="/data/projects/encode/json/exps/"): #extracts experiment metadata
    """Collect summary metadata for one experiment record.

    The record is read from ``<cache_dir>/<dataset>.json`` when that file exists
    and parses as JSON; otherwise it is downloaded from encodeproject.org.

    Parameters
    ----------
    dataset : str
        Identifier used both as the cache file stem and in the portal URL.
    creds : str
        Value placed after ``Basic `` in the Authorization header of any
        portal request.
    cache_dir : str
        Directory holding cached ``<dataset>.json`` files.

    Returns
    -------
    tuple
        ``(assayTermName, assayType, biosample, tissue, donor, treatment,
        status, numberReads, targets)`` where

        * ``assayTermName`` is ``data["assay_title"]``;
        * ``assayType`` is the first element of ``data["assay_slims"]``;
        * ``biosample`` is ``data["biosample_summary"]`` with the substring
          ``"nuclear fraction "`` removed, or ``"NA"`` when unavailable;
        * ``tissue`` is ``data["biosample_ontology"]["term_name"]``;
        * ``donor`` / ``treatment`` are comma-joined, sorted, de-duplicated
          donor accessions / treatment term names over all replicates;
        * ``status`` is ``data["status"]``;
        * ``numberReads`` is a list with one ``read_count`` per released
          fastq file;
        * ``targets`` is ``[data["target"]["label"]]`` or ``[]``.

    Raises
    ------
    KeyError, IndexError, TypeError
        If a required field is missing from the record.
    urllib.error.URLError
        If a portal download is needed and fails.
    """
    cachePath = os.path.join(cache_dir, dataset + ".json")
    fromCache = True
    try:
        with open(cachePath) as cached:
            data = json.load(cached)
    except (OSError, ValueError):
        # Missing/unreadable cache file or invalid JSON: fall back to the portal.
        fromCache = False
        data = _fetch_experiment_json(dataset, creds)

    #assayTermName = data["assay_term_name"]
    assayTermName = data["assay_title"]
    assayType = data["assay_slims"][0]

    summary = data.get("biosample_summary")
    biosample = summary.replace("nuclear fraction ","") if isinstance(summary, str) else "NA"

    try:
        tissue = data["biosample_ontology"]["term_name"]
    except (KeyError, TypeError):
        if not fromCache:
            # The record already came from the portal; downloading it again
            # would not add the missing field.
            raise
        # The cached copy lacks the ontology term: retry with a fresh download
        # and use that record for everything that follows.
        data = _fetch_experiment_json(dataset, creds)
        tissue = data["biosample_ontology"]["term_name"]

    donors = set()
    treatments = set()
    for entry in data["replicates"]:
        sample = entry["library"]["biosample"]
        donors.add(sample["donor"]["accession"])
        # Treatments are optional; skip entries without a usable term name.
        for t in sample.get("treatments") or []:
            try:
                treatments.add(t["treatment_term_name"])
            except (KeyError, TypeError):
                continue
    # Sort so the joined strings are stable: callers use them as identity keys.
    donor = ",".join(sorted(donors))
    treatment = ",".join(sorted(treatments))

    status=data["status"]

    targets = []
    if "target" in data:
        targets.append(data["target"]["label"])

    numberReads = []

    for entry in data["files"]: #loops through files associated with experiment
        try:
            if entry["file_type"] == "fastq" and entry["status"] == "released":
                numberReads.append(entry["read_count"])
        except (KeyError, TypeError):
            # Report the experiment whose file entry lacks an expected field.
            print(dataset)
    #numberReads = list(set(numberReads))
    return assayTermName, assayType, biosample, tissue, donor, treatment, status, numberReads, targets

In [4]:
# Encode "user:password" as base64 text for use in an HTTP Basic Authorization header.
usrname, psswd = Process_Token()
base64string = base64.b64encode("{}:{}".format(usrname, psswd).encode("ascii"))
creds = base64string.decode("utf-8")

## ENCODE Phase IV

### ENCODE4 - Human & Mouse

In [13]:
## Parameters
species1 = "Homo+sapiens"
species2 = "Mus+musculus"
rfa="ENCODE4"

## Build query
urlMain = "https://www.encodeproject.org/search/?type=Experiment&" + \
    "type=FunctionalCharacterizationExperiment&" + \
    "control_type!=*&status=released&award.rfa=" + rfa + \
    "&replicates.library.biosample.donor.organism.scientific_name=" + species1 + \
    "&replicates.library.biosample.donor.organism.scientific_name=" + species2 + \
    "&format=json&limit=all" #update limit=all for all datasets

# Not named `request`: that would overwrite the `request` module imported from urllib.
searchRequest = urllib.request.Request(urlMain)
searchRequest.add_header("Authorization", "Basic %s" % creds)
with urllib.request.urlopen(searchRequest) as response:
    data = json.loads(response.read())

biosampleList = []
tissueList = []
experimentDict = {} # assay type -> assay title -> running tallies

for entry in data["@graph"]: #loops through experiments
    experiment = entry["accession"]
    assayTermName, assayType, biosample, tissue, donor, treatment, status, numberReads, targets = \
        Extract_Experiment_Metatdata(experiment, creds)
    if assayType not in experimentDict:
        experimentDict[assayType] = {}
    if assayTermName not in experimentDict[assayType]:
        experimentDict[assayType][assayTermName] = {"numExp": 0, "numRead":[], "biosamples":[], "tissues":[], "biosampleType":{}, "targets":[]}
    assayStats = experimentDict[assayType][assayTermName]

    assayStats["numExp"] += 1
    assayStats["numRead"] += numberReads
    assayStats["targets"] += targets

    # When the summary mentions "Homo sapiens", identify the biosample by
    # tissue + treatment + donor; otherwise use the summary text itself.
    if "Homo sapiens" in biosample:
        longBiosample = tissue + " " + treatment + " " + donor
    else:
        longBiosample = biosample
    assayStats["biosamples"].append(longBiosample)
    biosampleList.append(longBiosample)

    assayStats["tissues"].append(tissue)
    tissueList.append(tissue)

print("Total # biosamples", "\t", len(set(biosampleList)))
print("Total # cell/tissue types", "\t", len(set(tissueList)))

# One row per assay title: # experiments, # distinct biosamples,
# # distinct tissues, total read count, # distinct targets.
for assayGroup in experimentDict:
    print(assayGroup)
    for assayName in experimentDict[assayGroup]:
        stats = experimentDict[assayGroup][assayName] # separate name so `data` keeps the search response
        sumRead = sum(stats["numRead"])
        numTargets = len(set(stats["targets"]))
        print("\t", assayName, "\t", stats["numExp"], "\t", len(set(stats["biosamples"])), "\t", len(set(stats["tissues"])), "\t", sumRead, "\t", numTargets)


ENCSR724HQB
ENCSR724HQB
ENCSR917JIA
ENCSR917JIA
Total # biosamples 	 1341
Total # cell/tissue types	 318
DNA binding
	 Mint-ChIP-seq 	 977 	 154 	 62 	 131857097140 	 6
	 Histone ChIP-seq 	 791 	 194 	 53 	 74893183558 	 6
	 TF ChIP-seq 	 875 	 191 	 74 	 92316660531 	 619
CRISPR screen
	 Flow-FISH CRISPR screen 	 280 	 9 	 5 	 4847280143 	 0
	 proliferation CRISPR screen 	 53 	 13 	 11 	 4351285171 	 0
	 FACS CRISPR screen 	 12 	 7 	 2 	 965222911 	 0
Transcription
	 CRISPR RNA-seq 	 336 	 2 	 2 	 81858461506 	 215
	 total RNA-seq 	 452 	 367 	 96 	 39449715780 	 0
	 microRNA-seq 	 197 	 191 	 80 	 5049178324 	 0
	 shRNA RNA-seq 	 23 	 2 	 2 	 2238265452 	 18
	 polyA plus RNA-seq 	 56 	 56 	 11 	 3830639560 	 0
	 Bru-seq 	 49 	 34 	 17 	 13159190800 	 0
	 BruChase-seq 	 32 	 16 	 16 	 7695028044 	 0
	 BruUV-seq 	 16 	 16 	 16 	 4009149608 	 0
	 PAS-seq 	 7 	 7 	 7 	 1424618846 	 0
	 PRO-seq 	 20 	 4 	 3 	 3167532333 	 0
	 PRO-cap 	 25 	 8 	 7 	 4386600070 	 0
	 long read RNA-seq 	 162

### ENCODE4 - Human

In [14]:
## Parameters
species = "Homo+sapiens"
rfa="ENCODE4"

## Build query
urlMain = "https://www.encodeproject.org/search/?type=Experiment&" + \
    "type=FunctionalCharacterizationExperiment&" + \
    "control_type!=*&status=released&award.rfa=" + rfa + \
    "&replicates.library.biosample.donor.organism.scientific_name=" + species + \
    "&format=json&limit=all" #update limit=all for all datasets

# Not named `request`: that would overwrite the `request` module imported from urllib.
searchRequest = urllib.request.Request(urlMain)
searchRequest.add_header("Authorization", "Basic %s" % creds)
with urllib.request.urlopen(searchRequest) as response:
    data = json.loads(response.read())

biosampleList = []
tissueList = []
experimentDict = {} # assay type -> assay title -> running tallies

for entry in data["@graph"]: #loops through experiments
    experiment = entry["accession"]
    assayTermName, assayType, biosample, tissue, donor, treatment, status, numberReads, targets = \
        Extract_Experiment_Metatdata(experiment, creds)
    if assayType not in experimentDict:
        experimentDict[assayType] = {}
    if assayTermName not in experimentDict[assayType]:
        experimentDict[assayType][assayTermName] = {"numExp": 0, "numRead":[], "biosamples":[], "tissues":[], "biosampleType":{}, "targets":[]}
    assayStats = experimentDict[assayType][assayTermName]

    assayStats["numExp"] += 1
    assayStats["numRead"] += numberReads
    assayStats["targets"] += targets

    # When the summary mentions "Homo sapiens", identify the biosample by
    # tissue + treatment + donor; otherwise use the summary text itself.
    if "Homo sapiens" in biosample:
        longBiosample = tissue + " " + treatment + " " + donor
    else:
        longBiosample = biosample
    assayStats["biosamples"].append(longBiosample)
    biosampleList.append(longBiosample)

    assayStats["tissues"].append(tissue)
    tissueList.append(tissue)

print("Total # biosamples", "\t", len(set(biosampleList)))
print("Total # cell/tissue types", "\t", len(set(tissueList)))

# One row per assay title: # experiments, # distinct biosamples,
# # distinct tissues, total read count, # distinct targets.
for assayGroup in experimentDict:
    print(assayGroup)
    for assayName in experimentDict[assayGroup]:
        stats = experimentDict[assayGroup][assayName] # separate name so `data` keeps the search response
        sumRead = sum(stats["numRead"])
        numTargets = len(set(stats["targets"]))
        print("\t", assayName, "\t", stats["numExp"], "\t", len(set(stats["biosamples"])), "\t", len(set(stats["tissues"])), "\t", sumRead, "\t", numTargets)

ENCSR724HQB
ENCSR724HQB
ENCSR917JIA
ENCSR917JIA
Total # biosamples 	 1067
Total # cell/tissue types	 293
CRISPR screen
	 Flow-FISH CRISPR screen 	 280 	 9 	 5 	 4847280143 	 0
	 proliferation CRISPR screen 	 53 	 13 	 11 	 4351285171 	 0
	 FACS CRISPR screen 	 2 	 1 	 1 	 44655680 	 0
DNA accessibility
	 ATAC-seq 	 368 	 302 	 183 	 98254548592 	 0
	 DNase-seq 	 713 	 603 	 136 	 146623587892 	 0
DNA binding
	 Mint-ChIP-seq 	 977 	 154 	 62 	 131857097140 	 6
	 Histone ChIP-seq 	 791 	 194 	 53 	 74893183558 	 6
	 TF ChIP-seq 	 875 	 191 	 74 	 92316660531 	 619
Massively parallel reporter assay
	 MPRA 	 27 	 7 	 6 	 16441215941 	 0
	 STARR-seq 	 21 	 17 	 6 	 26997995089 	 0
Transcription
	 total RNA-seq 	 373 	 288 	 91 	 30815536968 	 0
	 CRISPR RNA-seq 	 336 	 2 	 2 	 81858461506 	 215
	 microRNA-seq 	 108 	 103 	 73 	 2662795485 	 0
	 shRNA RNA-seq 	 23 	 2 	 2 	 2238265452 	 18
	 Bru-seq 	 49 	 34 	 17 	 13159190800 	 0
	 BruChase-seq 	 32 	 16 	 16 	 7695028044 	 0
	 BruUV-seq 	

### ENCODE4 - Mouse

In [15]:
## Parameters
species = "Mus+musculus"
rfa="ENCODE4"

## Build query
urlMain = "https://www.encodeproject.org/search/?type=Experiment&" + \
    "type=FunctionalCharacterizationExperiment&" + \
    "control_type!=*&status=released&award.rfa=" + rfa + \
    "&replicates.library.biosample.donor.organism.scientific_name=" + species + \
    "&format=json&limit=all" #update limit=all for all datasets

# Not named `request`: that would overwrite the `request` module imported from urllib.
searchRequest = urllib.request.Request(urlMain)
searchRequest.add_header("Authorization", "Basic %s" % creds)
with urllib.request.urlopen(searchRequest) as response:
    data = json.loads(response.read())

biosampleList = []
tissueList = []
experimentDict = {} # assay type -> assay title -> running tallies

for entry in data["@graph"]: #loops through experiments
    experiment = entry["accession"]
    assayTermName, assayType, biosample, tissue, donor, treatment, status, numberReads, targets = \
        Extract_Experiment_Metatdata(experiment, creds)
    if assayType not in experimentDict:
        experimentDict[assayType] = {}
    if assayTermName not in experimentDict[assayType]:
        experimentDict[assayType][assayTermName] = {"numExp": 0, "numRead":[], "biosamples":[], "tissues":[], "biosampleType":{}, "targets":[]}
    assayStats = experimentDict[assayType][assayTermName]

    assayStats["numExp"] += 1
    assayStats["numRead"] += numberReads
    assayStats["targets"] += targets

    # When the summary mentions "Homo sapiens", identify the biosample by
    # tissue + treatment + donor; otherwise use the summary text itself.
    if "Homo sapiens" in biosample:
        longBiosample = tissue + " " + treatment + " " + donor
    else:
        longBiosample = biosample
    assayStats["biosamples"].append(longBiosample)
    biosampleList.append(longBiosample)

    assayStats["tissues"].append(tissue)
    tissueList.append(tissue)

print("Total # biosamples", "\t", len(set(biosampleList)))
print("Total # cell/tissue types", "\t", len(set(tissueList)))

# One row per assay title: # experiments, # distinct biosamples,
# # distinct tissues, total read count, # distinct targets.
for assayGroup in experimentDict:
    print(assayGroup)
    for assayName in experimentDict[assayGroup]:
        stats = experimentDict[assayGroup][assayName] # separate name so `data` keeps the search response
        sumRead = sum(stats["numRead"])
        numTargets = len(set(stats["targets"]))
        print("\t", assayName, "\t", stats["numExp"], "\t", len(set(stats["biosamples"])), "\t", len(set(stats["tissues"])), "\t", sumRead, "\t", numTargets)

Total # biosamples 	 274
Total # cell/tissue types	 32
CRISPR screen
	 FACS CRISPR screen 	 10 	 6 	 1 	 920567231 	 0
Transcription
	 microRNA-seq 	 89 	 88 	 8 	 2386382839 	 0
	 total RNA-seq 	 79 	 79 	 6 	 8634178812 	 0
	 PAS-seq 	 3 	 3 	 3 	 375502866 	 0
	 long read RNA-seq 	 64 	 60 	 9 	 167295553 	 0
	 polyA plus RNA-seq 	 2 	 2 	 2 	 526484832 	 0
DNA accessibility
	 DNase-seq 	 177 	 176 	 17 	 76708564334 	 0
Single cell
	 snATAC-seq 	 43 	 13 	 8 	 37715581653 	 0
	 scRNA-seq 	 273 	 105 	 12 	 38875154074 	 0
	 long read scRNA-seq 	 3 	 3 	 2 	 2277781 	 0
3D chromatin structure
	 ChIA-PET 	 11 	 6 	 3 	 14802851418 	 2
	 Hi-C 	 16 	 15 	 5 	 44700866460 	 0


In [None]:
#urlMain = "https://www.encodeproject.org/search/?type=Experiment&" + \
#    "type=FunctionalCharacterizationExperiment&" + \
#    "control_type!=*&status=released&perturbed=true&perturbed=false" + \
#    "&award.rfa=ENCODE4&award.rfa=ENCODE3&award.rfa=ENCODE2&award.rfa=ENCODE2-Mouse" + \
#    "&replicates.library.biosample.donor.organism.scientific_name=" + species1 + \
#    "&format=json&limit=all" #update limit=all for all datasets

## Previous ENCODE Versions