In [1]:
import json
from bs4 import BeautifulSoup
import requests
import csv
# import lxml

# JSON Parser
Run this code to generate a csv file that contains information about bigWig files from selected experiments on <a href='https://www.encodeproject.org'>encodeproject.org</a>

In [3]:
# csv file header
header = ['Biosample Summary', 'Assay', 'Target', 'BigWig File ID', 'Output Type','Isogenic Replicate', 'Download Link']

# website to be scraped
url = 'https://www.encodeproject.org/'

# webpage containing experiments to be scraped
# the 'search... is in the URL once experiments have been selected and in the list format
# all &format=json at the end of the 'search...' URL
experiments = url + 'search/?type=Experiment&control_type!=*&status=released&perturbed=false&assay_title=TF+ChIP-seq&target.investigated_as=transcription+factor&target.investigated_as=chromatin+remodeler&target.investigated_as=cofactor&target.investigated_as=RNA+polymerase+complex&replicates.library.biosample.donor.organism.scientific_name=Homo+sapiens&biosample_ontology.classification=tissue&biosample_ontology.term_name=liver&format=json'

# scrape text from webpage body
# BeautifulSoup (BS4) is the library for scraping webpages
# json puts webpage contents into a json object
soup = BeautifulSoup(requests.get(experiments).text, 'html.parser')
experimental_matrix = json.loads(str(soup))

# write to csv
# the bame bigWigFiles.csv can be changed if multiple webpages are scraped; otherwise results will be overwritten
with open('bigWigFiles_human_liver_TFs.csv','w') as f:
    
    writer = csv.writer(f)
    writer.writerow(header)
    
    # iterate over experiments
    for obj in experimental_matrix['@graph']:
        
        #scrape from experiments webpage
        soup = BeautifulSoup(requests.get(url+obj['@id']+'?format=json').text, 'html.parser')
        experiment_json = json.loads(str(soup))
        
        # grab desired information
        biosample_summary = experiment_json['biosample_summary']
        assay = experiment_json['assay_title']
        
        # DNase-seq and ATAC-seq don't have targets
        if 'target' in experiment_json.keys():
            target = experiment_json['target']['label']
        else:
            target = ''
        
        # filter for bigWig files from ENCODE4
        # This filter may have to change once new version of ENCODE results is released. 
        bigWig_files = list(filter(lambda x: 'File' in x['@type'] and x['status']=='released' and x['file_format']=='bigWig' and "ENCODE4" in x['analyses'][0]['title'], experiment_json['files']))
        
        # add files to csv file
        for file in bigWig_files:
            link = url+file['href']
            output_type = file['output_type']
            replicates = ','.join(file['technical_replicates'])
            fname = file['accession']
            writer.writerow([biosample_summary, assay, target, fname, output_type, replicates, link])

KeyError: '@graph'