<a href="https://colab.research.google.com/github/Moore-Lab-UMass/ENCODE-API/blob/main/Retrieve_K562_TF_Peaks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Jill E. Moore
#Moore Lab - UMass Chan Medical School
#May 2022

import os, json, urllib.request, urllib.parse, urllib.error, requests

def Download_File(name, extension, outputDir): #downloads file to local machine
    """Download an ENCODE file into ``outputDir`` unless it is already there.

    The file is fetched from
    ``https://www.encodeproject.org/files/<name>/@@download/<name>.<extension>``
    and stored as ``<outputDir>/<name>.<extension>``.

    Args:
        name: File accession used both in the URL and in the local file name.
        extension: File extension without the leading dot (e.g. ``"bed.gz"``).
        outputDir: Existing directory that receives the file.

    Returns:
        None. When the target file already exists nothing is requested.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        OSError: If the file cannot be written.
    """
    fileName = name + "." + extension
    outputPath = os.path.join(outputDir, fileName)
    if os.path.exists(outputPath):
        return

    url = "https://www.encodeproject.org/files/" + name + "/@@download/" + fileName
    # Write to a side file first: the existence check above would otherwise
    # treat a truncated or failed download as complete on the next run.
    partialPath = outputPath + ".part"
    try:
        with requests.get(url, stream=True) as r:
            # Without this an HTTP error page would be saved as the data file.
            r.raise_for_status()
            with open(partialPath, "wb") as outputFile:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    outputFile.write(chunk)
        os.replace(partialPath, outputPath)
    finally:
        # Only present here if the download did not complete.
        if os.path.exists(partialPath):
            os.remove(partialPath)

def Retrieve_QC_Metrics(peaks): #retrieves qc metrics about peak file
    """Fetch the JSON record of a peak file and pull out two QC values.

    Requests ``https://www.encodeproject.org/files/<peaks>/?format=json`` and
    scans the record's ``quality_metrics`` list.

    Args:
        peaks: Accession of the peak file.

    Returns:
        A ``(frip, numPeaks)`` tuple holding the ``"frip"`` and
        ``"reproducible_peaks"`` values found in the quality metrics. Each
        element is the string ``"NA"`` when no metric carries that key. If
        several metrics carry the same key, the last one wins.

    Raises:
        urllib.error.URLError: If the request fails.
    """
    urlPeak = "https://www.encodeproject.org/files/" + peaks + "/?format=json"
    # Context manager so the HTTP connection is released promptly.
    with urllib.request.urlopen(urlPeak) as response:
        data = json.loads(response.read())

    frip = "NA"
    numPeaks = "NA"

    # A record without quality metrics yields the "NA" placeholders
    # instead of a KeyError.
    for qc in data.get("quality_metrics", []):
        if "frip" in qc:
            frip = qc["frip"]
        if "reproducible_peaks" in qc:
            numPeaks = qc["reproducible_peaks"]
    return frip, numPeaks

def Extract_Experiment_Metatdata(exp, genome): #extracts experiment metadata
    """Fetch an experiment record and summarise it with its preferred peak file.

    Requests ``https://www.encodeproject.org/experiments/<exp>/?format=json``,
    reads the descriptive fields, and looks for a released ``bed narrowPeak``
    file on the requested assembly that is flagged ``preferred_default``. When
    such a file exists, its QC metrics are looked up via
    ``Retrieve_QC_Metrics``.

    Args:
        exp: Experiment accession.
        genome: Assembly name the peak file must match (compared to the
            file's ``assembly`` field).

    Returns:
        A tuple ``(biosample, peaks, tf, rfa, lab, frip, numPeaks)``. Any
        value that is not available is the string ``"NA"``. If several files
        qualify, the last one listed is reported.

    Raises:
        urllib.error.URLError: If a request fails.
    """
    urlExp = "https://www.encodeproject.org/experiments/" + exp + "/?format=json"
    # Context manager so the HTTP connection is released promptly.
    with urllib.request.urlopen(urlExp) as response:
        data = json.loads(response.read())

    # Missing sections fall back to the same "NA" placeholder used below,
    # rather than aborting the whole run with a KeyError.
    rfa = data.get("award", {}).get("rfa", "NA") #project/phase of encode
    tf = data.get("target", {}).get("label", "NA") #target of the assay, i.e. TF
    lab = data.get("lab", {}).get("title", "NA") #lab that submitted the experiment
    biosample = data.get("biosample_ontology", {}).get("term_name", "NA") #short name of biosample

    peaks = "NA"
    frip = "NA"
    numPeaks = "NA"

    for entry in data.get("files", []): #loops through files associated with experiment
        if (entry.get("file_type") == "bed narrowPeak"
                and entry.get("status") == "released"
                and entry.get("assembly") == genome
                and entry.get("preferred_default") is True):
            peaks = entry["accession"] #peak file

    if peaks != "NA":
        #Download_File(peaks, "bed.gz", "/data/output")
        frip, numPeaks = Retrieve_QC_Metrics(peaks) #extract qc metrics
    return biosample, peaks, tf, rfa, lab, frip, numPeaks

## Parameters
genome = "GRCh38"
species = "Homo+sapiens"
cellType = "K562"

## Build query
# Values are inserted into the query string as-is, so they must already be
# URL-encoded (hence the "+" in the species name).
urlMain = "https://www.encodeproject.org/search/?type=Experiment&status=released" + \
    "&perturbed=false&assay_title=TF+ChIP-seq" + \
    "&replicates.library.biosample.donor.organism.scientific_name=" + species + \
    "&biosample_ontology.term_name=" + cellType + "&format=json&limit=10" #update limit=all for all datasets

# Context manager so the HTTP connection is released promptly.
with urllib.request.urlopen(urlMain) as response:
    data = json.loads(response.read())

## Print one tab-separated row per experiment
print("\t".join(["experiment_accession", "peak_accession", "tf", "biosample",
                 "rfa", "lab", "frip", "number_peaks"]))

for entry in data["@graph"]: #loops through experiments
    biosample, peaks, tf, rfa, lab, frip, numPeaks = \
        Extract_Experiment_Metatdata(entry["accession"], genome)
    # frip is the string "NA" when no peak file or no frip metric was found;
    # round() only accepts numbers, so pass placeholders through unchanged.
    if isinstance(frip, (int, float)):
        fripText = str(round(frip, 4))
    else:
        fripText = str(frip)
    print("\t".join([entry["accession"], peaks, tf, biosample,
                     rfa, lab, fripText, str(numPeaks)]))

experiment_accession	peak_accession	tf	biosample	rfa	lab	frip	number_peaks
ENCSR208VNN	ENCFF518SJY	IFI16	K562	ENCODE4	Michael Snyder, Stanford	0.0018	1568
ENCSR579XWM	ENCFF023JGT	HSF4	K562	ENCODE4	Michael Snyder, Stanford	0.0038	2996
ENCSR659CCI	ENCFF516ZWP	FOXK1	K562	ENCODE4	Richard Myers, HAIB	0.0107	7647
ENCSR228ELU	ENCFF214SNH	TEAD1	K562	ENCODE4	Richard Myers, HAIB	0.0362	18156
ENCSR939CDD	ENCFF086FAZ	ARID4B	K562	ENCODE4	Richard Myers, HAIB	0.0445	18831
ENCSR530WIV	ENCFF491EEI	FOXP1	K562	ENCODE4	Richard Myers, HAIB	0.0237	12557
ENCSR946WBN	ENCFF865UPM	JUN	K562	ENCODE4	Richard Myers, HAIB	0.0108	6966
ENCSR788DXU	ENCFF197SXI	FOXA3	K562	ENCODE4	Richard Myers, HAIB	0.0208	12382
ENCSR817FMN	ENCFF303CVC	TEAD4	K562	ENCODE4	Richard Myers, HAIB	0.0052	3856
ENCSR849LXI	ENCFF478MAJ	MTF1	K562	ENCODE4	Michael Snyder, Stanford	0.0018	1840
