# Prepare environment

In [1]:
# Install the scraper into the kernel's own environment (%pip rather than !pip),
# pinned to the commit this notebook was last run against so re-runs are reproducible.
%pip install git+https://github.com/katarinagresova/ensembl_scraper.git@6d3bba8e6be7f5ead58a3bbaed6a4e8cd35e62fd

Collecting git+https://github.com/katarinagresova/ensembl_scraper.git
  Cloning https://github.com/katarinagresova/ensembl_scraper.git to /tmp/pip-req-build-h9vs0asx
  Running command git clone --filter=blob:none -q https://github.com/katarinagresova/ensembl_scraper.git /tmp/pip-req-build-h9vs0asx
  Resolved https://github.com/katarinagresova/ensembl_scraper.git to commit 6d3bba8e6be7f5ead58a3bbaed6a4e8cd35e62fd
  Preparing metadata (setup.py) ... done
You should consider upgrading via the '/home/katarina/git/genomic_benchmarks/venv/bin/python -m pip install --upgrade pip' command.


# Create config file

In [2]:
import yaml

# Name of the temporary YAML file that is passed to the scraper with `-c`.
user_config = 'user_config.yaml'

# Scraper settings: the output root directory and, for each organism,
# the collection of feature files to process.
config = dict(
    root_dir="../../datasets/",
    organisms=dict(
        # A set (not a list); PyYAML serialises it with the !!set tag.
        homo_sapiens={"regulatory_feature"},
    ),
)

with open(user_config, mode='w') as handle:
    yaml.dump(config, stream=handle)

# Prepare directories

In [3]:
from pathlib import Path

# Final location of the dataset directory; it is deleted below if it already exists.
BASE_FILE_PATH = Path("../../datasets/human_enhancers_ensembl_regulatory/")

# adapted from https://stackoverflow.com/a/57892171
def rm_tree(pth: Path):
    """Recursively delete the directory ``pth`` and everything inside it.

    Symbolic links found inside ``pth`` are removed as links and never
    followed, so a link pointing at a directory elsewhere (or a dangling
    link) is unlinked without touching, or failing on, its target.

    Args:
        pth: Directory to remove.

    Raises:
        OSError: If ``pth`` is not a directory or an entry cannot be removed.
    """
    for child in pth.iterdir():
        # Test is_symlink() first: is_file()/is_dir() follow links, so a link to a
        # directory would otherwise be recursed into and its target emptied.
        if child.is_symlink() or not child.is_dir():
            child.unlink()
        else:
            rm_tree(child)
    pth.rmdir()

# Delete the dataset directory left by a previous run, if any, before regenerating it.
if BASE_FILE_PATH.exists():
    rm_tree(BASE_FILE_PATH)

# Run tool

In [4]:
# Run the scraper module, pointing it at the config file written earlier in this notebook.
!python -m scraper.ensembl_scraper -c user_config.yaml

Processing organisms:   0%|                               | 0/1 [00:00<?, ?it/s]
Processing feature files:   0%|                           | 0/1 [00:00<?, ?it/s]
INFO:root:download_file(): Going to download file from path ftp://ftp.ensembl.org/pub/release-100/mysql/regulation_mart_100/hsapiens_regulatory_feature__regulatory_feature__main.txt.gz
INFO:root:download_file(): File downloaded to path ../../datasets//tmp//homo_sapiens_regulatory_feature.txt.gz.
INFO:root:parse_feature_file(): Going to parse file ../../datasets//tmp//homo_sapiens_regulatory_feature.txt.gz
INFO:root:parse_feature_file(): Done parsing file ../../datasets//tmp//homo_sapiens_regulatory_feature.txt.gz


Processing feature types:   0%|                           | 0/6 [00:00<?, ?it/s]
INFO:root:find_sequences(): Going to find sequences based on genomic loci.
INFO:root:download_2bit_file(): Going to download 2bit file hg38
INFO:root:download_2bit_file(): File for hg38 downloaded to path ../../datasets//tmp/hg38

# Cleaning

In [4]:
# Move the enhancer dataset to its final name (the directory removed in the "Prepare directories" step).
!mv ../../datasets/homo_sapiens_regulatory_feature_enhancer ../../datasets/human_enhancers_ensembl_regulatory/

In [5]:
# Remove the temporary scraper config file created in the "Create config file" step.
!rm user_config.yaml

In [8]:
# Remove the tmp directory; the scraper log above shows its downloads were stored there.
!rm -rf ../../datasets/tmp/

In [12]:
# Discard the directories of the other regulatory feature types (presumably written by the
# scraper run above, which reported 6 feature types); only the enhancer dataset is kept.
!rm -rf ../../datasets/homo_sapiens_regulatory_feature_CTCF_binding_site
!rm -rf ../../datasets/homo_sapiens_regulatory_feature_open_chromatin_region
!rm -rf ../../datasets/homo_sapiens_regulatory_feature_promoter
!rm -rf ../../datasets/homo_sapiens_regulatory_feature_promoter_flanking_region
!rm -rf ../../datasets/homo_sapiens_regulatory_feature_TF_binding_site

# Final reformatting

  * gzip all CSV files
  * add extra formatting to yaml config file

In [6]:
# Compress every *.csv file under the dataset directory, one gzip call per file.
!find ../../datasets/human_enhancers_ensembl_regulatory/ -type f -name "*.csv" -exec gzip {} \;

In [7]:
# Load the dataset's metadata.yaml. A YAML parse error is deliberately not caught:
# printing it and carrying on would leave `config` bound to the scraper config
# created earlier in this notebook, hiding the real failure behind a confusing
# error in the cells that follow.
with open("../../datasets/human_enhancers_ensembl_regulatory/metadata.yaml", "r") as stream:
    config = yaml.safe_load(stream)

config

{'classes': {'negative': {'type': 'fa.gz',
   'url': 'ftp://ftp.ensembl.org/pub/release-100/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz'},
  'positive': {'type': 'fa.gz',
   'url': 'ftp://ftp.ensembl.org/pub/release-100/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz'}},
 'version': 0}

In [8]:
# Add the `extra_processing` entry to both classes of the loaded metadata.
config['classes']['positive'].update(extra_processing='ENSEMBL_HUMAN_GENOME')
config['classes']['negative'].update(extra_processing='ENSEMBL_HUMAN_GENOME')

config

{'classes': {'negative': {'type': 'fa.gz',
   'url': 'ftp://ftp.ensembl.org/pub/release-100/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
   'extra_processing': 'ENSEMBL_HUMAN_GENOME'},
  'positive': {'type': 'fa.gz',
   'url': 'ftp://ftp.ensembl.org/pub/release-100/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
   'extra_processing': 'ENSEMBL_HUMAN_GENOME'}},
 'version': 0}

In [9]:
# Write the updated metadata back over the original metadata.yaml.
with open("../../datasets/human_enhancers_ensembl_regulatory/metadata.yaml", mode='w') as handle:
    yaml.dump(config, stream=handle)

# Testing

In [12]:
from genomic_benchmarks.loc2seq import download_dataset

# Fetch the dataset by name through genomic_benchmarks as a check of the files prepared above.
download_dataset("human_enhancers_ensembl_regulatory")

Reference /home/katarina/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.dna.toplevel.fa.gz already exists. Skipping.


100%|██████████| 24/24 [00:30<00:00,  1.28s/it]


PosixPath('/home/katarina/.genomic_benchmarks/human_enhancers_ensembl_regulatory')

In [14]:
from genomic_benchmarks.data_check import info

# Print summary statistics for the dataset.
# NOTE(review): the second argument is presumably the dataset version (metadata.yaml has 'version': 0) — confirm.
info("human_enhancers_ensembl_regulatory", 0)

Dataset `human_enhancers_ensembl_regulatory` has 2 classes: negative, positive.

The length of genomic intervals ranges from 200 to 802, with average 394.7244176255964 and median 400.0.

Totally 213780 sequences have been found, 171024 for training and 42756 for testing.


Unnamed: 0,train,test
negative,85512,21378
positive,85512,21378
