In [1]:
#Jill E. Moore
#Moore Lab - UMass Chan Medical School
#May 2022

import os
import json
import urllib
from urllib import request, parse, error
import requests
import base64
from statistics import mean, median

In [2]:
def Process_Token(path="/home/moorej3/.encode.txt"):
    """Read a credential pair from a tab-delimited text file.

    Only the first line of the file is consulted. Trailing whitespace is
    stripped from it, it is split on tab characters, and the first two
    fields are returned in order.

    Args:
        path: Location of the credentials file. Defaults to the location
            that was previously hard-coded, so zero-argument calls behave
            as before.

    Returns:
        A 2-tuple ``(first_field, second_field)`` of strings.

    Raises:
        OSError: the file cannot be opened.
        StopIteration: the file is empty.
        IndexError: the first line holds fewer than two tab-separated fields.
    """
    # The context manager closes the handle even when parsing fails.
    with open(path) as credentials:
        credArray = next(credentials).rstrip().split("\t")
    return credArray[0], credArray[1]

In [3]:
def _Fetch_Experiment_Json(dataset, creds):
    """Download and parse the JSON record for one accession from encodeproject.org.

    ``creds`` is sent verbatim after ``Basic `` in the Authorization header.
    """
    url = "https://www.encodeproject.org/"+dataset+"/?format=json"
    req = urllib.request.Request(url)
    req.add_header("Authorization", "Basic %s" % creds)
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        return json.loads(response.read())


def Extract_Experiment_Metatdata(dataset, creds, genome): #extracts experiment metadata
    """Collect summary metadata for a single experiment record.

    The record is read from a local JSON cache when available and
    downloaded otherwise.

    Args:
        dataset: Experiment accession; used both as the cache file stem and
            in the download URL.
        creds: Base64 ``user:password`` string for HTTP basic auth.
        genome: Not referenced by this function; kept so existing callers
            continue to work.

    Returns:
        A 9-tuple:
        ``(assay_title, first assay_slims entry, biosample summary,
        tissue term name, donors, treatments, status, read counts, targets)``.
        ``donors`` and ``treatments`` are comma-joined strings of the unique
        values in sorted order; ``read counts`` is a list with one entry per
        released fastq file; ``targets`` is a list holding the target label
        when the record has a ``target`` key, otherwise empty.

    Raises:
        KeyError / IndexError: a required field is absent from the record.
        urllib.error.URLError: a download was needed and failed.
    """
    # Prefer the local cache; fall back to the network when the file is
    # missing, unreadable, or not valid JSON.
    try:
        with open("/data/projects/encode/json/exps/"+dataset+".json") as cached:
            data = json.load(cached)
    except (OSError, ValueError):
        data = _Fetch_Experiment_Json(dataset, creds)

    #assayTermName = data["assay_term_name"]
    assayTermName = data["assay_title"]
    assayType = data["assay_slims"][0]

    try:
        biosample = data["biosample_summary"].replace("nuclear fraction ","")
    except (KeyError, AttributeError):
        biosample = "NA"

    try:
        tissue = data["biosample_ontology"]["term_name"]
    except (KeyError, TypeError):
        # The record in hand lacks an expanded biosample_ontology; fetch a
        # fresh copy and use it for everything that follows.
        data = _Fetch_Experiment_Json(dataset, creds)
        tissue = data["biosample_ontology"]["term_name"]

    donors = []
    treatments = []
    for entry in data["replicates"]:
        donors.append(entry["library"]["biosample"]["donor"]["accession"])
        try:
            for t in entry["library"]["biosample"]["treatments"]:
                treatments.append(t["treatment_term_name"])
        except (KeyError, TypeError):
            # Replicates without treatment information are simply skipped.
            pass
    # Sort the unique values: raw set iteration order is not stable, so the
    # same donors/treatments could otherwise yield different strings and be
    # counted as distinct biosamples by callers.
    donor = ",".join(sorted(set(donors)))
    treatment = ",".join(sorted(set(treatments)))

    status = data["status"]

    targets = []
    if "target" in data:
        targets.append(data["target"]["label"])

    numberReads = []
    for entry in data["files"]: #loops through files associated with experiment
        try:
            if entry["file_type"] == "fastq" and entry["status"] == "released":
                numberReads.append(entry["read_count"])
        except (KeyError, TypeError):
            # Report the accession whose file entry is missing a field.
            print(dataset)
    #numberReads = list(set(numberReads))
    return assayTermName, assayType, biosample, tissue, donor, treatment, status, numberReads, targets

In [4]:
# Build the HTTP basic-auth token: base64 of "<user>:<password>", as text.
usrname, psswd = Process_Token()
userPass = '%s:%s' % (usrname, psswd)
base64string = base64.b64encode(userPass.encode('ascii'))
creds = str(base64string, 'utf-8')



In [None]:
## Parameters
# genome is passed through to Extract_Experiment_Metatdata for each experiment.
genome = "GRCh38"
# Organism names are written already URL-encoded ("+" stands for the space)
# because they are pasted straight into the query string below.
species1 = "Homo+sapiens"
species2 = "Mus+musculus"

# Alternate single-species settings, kept for reference:
#genome = "mm10"
#species = "Mus+musculus"

## Build query
# Earlier ENCODE4 single-species query, kept for reference:
#urlMain = "https://www.encodeproject.org/search/?type=Experiment&" + \
#    "control_type!=*&status=released&perturbed=true&perturbed=false&award.rfa=ENCODE2&" +\
#    "replicates.library.biosample.donor.organism.scientific_name=" + species + \
#    "&format=json&limit=all" #update limit=all for all datasets

urlMain = "https://www.encodeproject.org/search/?type=Experiment&" + \
    "type=FunctionalCharacterizationExperiment&" + \
    "control_type!=*&status=released&perturbed=true&perturbed=false&award.rfa=ENCODE2&" +\
    "replicates.library.biosample.donor.organism.scientific_name=" + species1 + \
    "&replicates.library.biosample.donor.organism.scientific_name=" + species2 + \
    "&format=json&limit=all" #update limit=all for all datasets

# Named searchRequest (not "request") so the urllib.request module imported
# at the top of the notebook under the name "request" is not overwritten.
searchRequest = urllib.request.Request(urlMain)
searchRequest.add_header("Authorization", "Basic %s" % creds)
# Close the HTTP response once the body has been parsed.
with urllib.request.urlopen(searchRequest) as response:
    data = json.loads(response.read())

# Running collections filled while walking the search results.
biosampleList = []
tissueList = []
experimentDict = {}   # experimentDict[assay slim][assay title] -> tallies

for entry in data["@graph"]: #loops through experiments
    experiment = entry["accession"]
    (assayTermName, assayType, biosample, tissue, donor,
     treatment, status, numberReads, targets) = Extract_Experiment_Metatdata(experiment, creds, genome)

    # Create the nested tally record the first time a slim/title pair is seen.
    perTitle = experimentDict.setdefault(assayType, {})
    if assayTermName not in perTitle:
        perTitle[assayTermName] = {"numExp": 0, "numRead":[], "biosamples":[], "tissues":[], "biosampleType":{}, "targets":[]}
    stats = perTitle[assayTermName]

    stats["numExp"] += 1
    stats["numRead"].extend(numberReads)
    stats["targets"].extend(targets)

    # A biosample is identified by tissue, treatment(s) and donor(s) together.
    longBiosample = " ".join((tissue, treatment, donor))
    stats["biosamples"].append(longBiosample)
    biosampleList.append(longBiosample)

    stats["tissues"].append(tissue)
    tissueList.append(tissue)

# Overall unique counts across every assay.
print("Total # biosamples", "\t", len(set(biosampleList)))
# The comma after the label was missing, which glued the tab onto the label
# and made this line print differently from the one above.
print("Total # cell/tissue types", "\t", len(set(tissueList)))

# One header line per assay slim, then one row per assay title with:
# title, # experiments, # unique biosamples, # unique tissues,
# total reads, # unique targets.
for assaySlim in experimentDict:
    print(assaySlim)
    # assayStats (not "data") so the search response held in "data" survives.
    for assayTitle, assayStats in experimentDict[assaySlim].items():
        sumRead = sum(assayStats["numRead"])
        numTargets = len(set(assayStats["targets"]))
        numBiosamples = len(set(assayStats["biosamples"]))
        numTissues = len(set(assayStats["tissues"]))
        print("\t", assayTitle, "\t", assayStats["numExp"], "\t", numBiosamples, "\t", numTissues, "\t", sumRead, "\t", numTargets)
    
    
    

ENCSR000APW
