In [None]:
%load_ext autoreload
%autoreload 2

# Introduction

## Setup

In [None]:
# Enable the Jupyter widgets notebook extension.
# NOTE(review): presumably needed so the tqdm.notebook progress bar used later renders as a widget — confirm.
!jupyter nbextension enable --py widgetsnbextension

In [None]:
import pandas as pd
from urllib import request
from tqdm.notebook import tqdm
from pathlib import Path
import yaml

## Download metadata file from ENCODE

In [None]:
# Fetch the experiment/file metadata table from ENCODE and save it as metadata.tsv.
# The query string restricts the result to: status=released, internal_tags=ENCORE,
# assay_title=eCLIP, biosamples K562 and HepG2, file type "bed narrowPeak",
# released analyses, and files flagged preferred_default=true.
!wget -O metadata.tsv "https://www.encodeproject.org/metadata/?status=released&internal_tags=ENCORE&assay_title=eCLIP&biosample_ontology.term_name=K562&biosample_ontology.term_name=HepG2&files.file_type=bed+narrowPeak&type=Experiment&files.analyses.status=released&files.preferred_default=true"

In [None]:
# Load the tab-separated metadata table downloaded above into a DataFrame.
metadata = pd.read_table('metadata.tsv')

In [None]:
metadata

## Prepare directory structure

In [None]:
# Directory that receives the generated per-protein CSV files and the YAML description.
BASE_FILE_PATH = Path("./csv/")

# adapted from https://stackoverflow.com/a/57892171
def rm_tree(pth: Path):
    """Recursively delete the directory `pth` and everything inside it.

    Only real sub-directories are descended into. Every other entry (regular
    files, symlinks - including symlinks that point at directories or at
    nothing - and special files) is unlinked, so data that lives outside
    `pth` and is merely linked from it is never touched.

    Raises whatever `Path.iterdir`, `Path.unlink` or `Path.rmdir` raise,
    e.g. when `pth` does not exist or is not a directory.
    """
    for child in pth.iterdir():
        # is_dir() follows symlinks, so exclude links explicitly: recursing into
        # a linked directory would delete the link target's contents.
        if child.is_dir() and not child.is_symlink():
            rm_tree(child)
        else:
            child.unlink()
    pth.rmdir()

# Start from an empty output directory on every run.
if BASE_FILE_PATH.exists():
    rm_tree(BASE_FILE_PATH)

BASE_FILE_PATH.mkdir()

In [None]:
# Delete the downloaded metadata file from disk; its contents were already read
# into the `metadata` DataFrame in an earlier cell.
!rm metadata.tsv

## Transform data

In [None]:
# Each distinct value of the 'Experiment target' column is one protein, i.e. one class.
targets = pd.unique(metadata['Experiment target'])
targets

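This step takes about 50 minutes on a single-core machine. It downloads the peak file(s) of every target, filters them, and writes one gzipped CSV per protein.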

In [None]:
# Column names assigned to the downloaded peak files when they are parsed.
# NOTE(review): these are the twelve BED12 field names, while the metadata query asks
# for "bed narrowPeak" files; only chrom/chromStart/chromEnd/strand are used below.
BED_HEADER = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts']
# Chromosomes kept in the output: chr1 - chr22, chrX, chrY and chrMT.
# range() excludes its upper bound, so 23 is required to include chr22.
# NOTE(review): the mitochondrial contig is spelled 'chrMT' here; if the downloaded
# files label it 'chrM', those rows are filtered out - confirm.
ALLOWED_CHROMOSOMES = ['chr' + str(i) for i in range(1, 23)] + ['chrX', 'chrY', 'chrMT']
# Mapping from the parsed column names to the column names of the output CSV.
OUTPUT_COLUMNS = {'chrom': 'chr', 'chromStart': 'start', 'chromEnd': 'end', 'strand': 'strand'}
# Outlier filter: keep only sequences whose length is within these bounds (inclusive).
MIN_SEQUENCE_LENGTH = 20
MAX_SEQUENCE_LENGTH = 100
# Upper bound on download attempts per file, so a permanently failing URL cannot hang the cell.
MAX_DOWNLOAD_ATTEMPTS = 10


def download_with_retry(url, local_file, max_attempts=MAX_DOWNLOAD_ATTEMPTS):
  """Download `url` to `local_file`, retrying because the network connection might break.

  Only `Exception` subclasses are retried, so interrupting the kernel
  (KeyboardInterrupt) stops the cell immediately. After `max_attempts`
  failed attempts the last error is re-raised.
  """
  for attempt in range(1, max_attempts + 1):
    try:
      request.urlretrieve(url, local_file)
      return
    except Exception:
      if attempt == max_attempts:
        raise
      print('Problem with file ', local_file, '. Trying again.')


def load_filtered_reads(local_file):
  """Parse one downloaded gzipped bed file and return its filtered rows.

  Keeps the chrom/chromStart/chromEnd/strand columns, drops rows on chromosomes
  outside ALLOWED_CHROMOSOMES and rows whose length (chromEnd - chromStart) is
  outside [MIN_SEQUENCE_LENGTH, MAX_SEQUENCE_LENGTH], and renames the columns
  to chr/start/end/strand.
  """
  reads = pd.read_csv(local_file, sep='\t', compression='gzip', header=None, names=BED_HEADER)
  # keep just necessary columns
  reads = reads[list(OUTPUT_COLUMNS)]
  # keep just sequences from the allowed chromosomes
  reads = reads[reads['chrom'].isin(ALLOWED_CHROMOSOMES)]
  # removing outliers - between() is inclusive on both ends
  lengths = reads['chromEnd'] - reads['chromStart']
  reads = reads[lengths.between(MIN_SEQUENCE_LENGTH, MAX_SEQUENCE_LENGTH)]
  return reads.rename(columns=OUTPUT_COLUMNS)


for target in tqdm(targets):

  # the protein name is the part of the target before the first '-'
  protein_name = target[: target.index('-')]
  rows = metadata[metadata['Experiment target'] == target]

  # collect one frame per file and concatenate once, instead of growing a DataFrame in the loop
  frames = []
  for _, row in rows.iterrows():

    local_file = row['File accession'] + '.bed.gz'
    download_with_retry(row['File download URL'], local_file)
    try:
      frames.append(load_filtered_reads(local_file))
    finally:
      # cleaning - delete downloaded bed file, even when parsing fails
      Path(local_file).unlink()

  if frames:
    df = pd.concat(frames, ignore_index=True)
  else:
    df = pd.DataFrame([], columns=list(OUTPUT_COLUMNS.values()))

  # add column with name of protein
  df['protein_name'] = protein_name

  filename = protein_name + '.csv.gz'
  df.to_csv(BASE_FILE_PATH / filename, index=False, compression='gzip')

## YAML file

In [None]:
# Build the dataset description and write it to metadata.yaml in the output directory.
# Every class (protein) gets an entry that points at the reference fasta file.

def reference_entry():
  """Return a fresh reference-genome entry; a new dict per class keeps the entries independent objects."""
  return {
      'type': 'fa.gz',
      'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
      'extra_processing': 'ENSEMBL_HUMAN_GENOME'
  }

desc = {
  'version': 0,
  # class name = the part of the target before the first '-'
  'classes': {target[: target.index('-')]: reference_entry() for target in targets}
}

yaml_path = BASE_FILE_PATH / 'metadata.yaml'
with yaml_path.open('w') as stream:
    yaml.dump(desc, stream)

desc