In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
from genomic_benchmarks.loc2seq.with_biopython import _fastagz2dict
from genomic_benchmarks.seq2loc import fasta2loc
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.notebook import tqdm
from pathlib import Path
import yaml
import tarfile

## Load genomic reference and download data from GitHub

In [3]:
genome = _fastagz2dict(Path.home() / ".genomic_benchmarks/fasta/Homo_sapiens.GRCh38.dna.toplevel.fa.gz",
                      24, 'MT')
genome.keys()

  0%|          | 0/24 [00:00<?, ?it/s]

dict_keys(['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', 'X', 'Y', 'MT'])

In [4]:
!wget http://www.cs.huji.ac.il/~tommy//enhancer_CNN/Enhancers_vs_negative.tgz

--2021-10-20 00:43:11--  http://www.cs.huji.ac.il/~tommy//enhancer_CNN/Enhancers_vs_negative.tgz
Resolving www.cs.huji.ac.il (www.cs.huji.ac.il)... 132.65.118.16
Connecting to www.cs.huji.ac.il (www.cs.huji.ac.il)|132.65.118.16|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://www.cs.huji.ac.il/~tommy/enhancer_CNN/Enhancers_vs_negative.tgz [following]
--2021-10-20 00:43:12--  https://www.cs.huji.ac.il/~tommy/enhancer_CNN/Enhancers_vs_negative.tgz
Connecting to www.cs.huji.ac.il (www.cs.huji.ac.il)|132.65.118.16|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 135457662 (129M) [application/x-gzip]
Saving to: ‘Enhancers_vs_negative.tgz’


2021-10-20 00:43:22 (13.9 MB/s) - ‘Enhancers_vs_negative.tgz’ saved [135457662/135457662]



## Create FASTA

In [5]:
# extract human data

with tarfile.open("Enhancers_vs_negative.tgz", "r:gz") as tar:
    for item in tar.getmembers():
        if item.name in ["Human/positive_samples", "Human/negative_samples"]:
            tar.extract(item, ".")

In [6]:
with open("Human/positive_samples") as fr:
    positives = fr.read().splitlines()
with open("Human/negative_samples") as fr:
    negatives = fr.read().splitlines()

with open("all_together.fa", "w") as fw:
    for i, p in enumerate(positives):
        fw.write(f">positive{i}\n{p}\n")
    for i, n in enumerate(negatives):
        fw.write(f">negative{i}\n{n}\n")

## Get locations

In [7]:
# slow!
locs = fasta2loc("./all_together.fa", genome)

28000 sequences read and parsed.


  0%|          | 0/25 [00:00<?, ?it/s]

  0%|          | 0/248956422 [00:00<?, ?it/s]

  0%|          | 0/242193529 [00:00<?, ?it/s]

  0%|          | 0/198295559 [00:00<?, ?it/s]

  0%|          | 0/190214555 [00:00<?, ?it/s]

  0%|          | 0/181538259 [00:00<?, ?it/s]

  0%|          | 0/170805979 [00:00<?, ?it/s]

  0%|          | 0/159345973 [00:00<?, ?it/s]

  0%|          | 0/145138636 [00:00<?, ?it/s]

  0%|          | 0/138394717 [00:00<?, ?it/s]

  0%|          | 0/133797422 [00:00<?, ?it/s]

  0%|          | 0/135086622 [00:00<?, ?it/s]

  0%|          | 0/133275309 [00:00<?, ?it/s]

  0%|          | 0/114364328 [00:00<?, ?it/s]

  0%|          | 0/107043718 [00:00<?, ?it/s]

  0%|          | 0/101991189 [00:00<?, ?it/s]

  0%|          | 0/90338345 [00:00<?, ?it/s]

  0%|          | 0/83257441 [00:00<?, ?it/s]

  0%|          | 0/80373285 [00:00<?, ?it/s]

  0%|          | 0/58617616 [00:00<?, ?it/s]

  0%|          | 0/64444167 [00:00<?, ?it/s]

  0%|          | 0/46709983 [00:00<?, ?it/s]

  0%|          | 0/50818468 [00:00<?, ?it/s]

  0%|          | 0/156040895 [00:00<?, ?it/s]

  0%|          | 0/57227415 [00:00<?, ?it/s]

  0%|          | 0/16569 [00:00<?, ?it/s]

27791 sequences found in the reference.


### A few checks

In [8]:
len(locs.keys())

27791

In [9]:
from Bio.Seq import Seq

def _rev(seq, strand):
    # reverse complement
    if strand == '-':
        return str(Seq(seq).reverse_complement())
    else:
        return seq

In [10]:
_rev(genome['1'][959200:959451], "-")

'GCATCTGGGCCCCACCGGGGCTGCCCGCACCGAGCACGCGAACGCGCCCTCCCGCCCTGAGGCCGCCGGCGTTGCGGTCGGAGAACCATAGAGCCACTCGGCTGGGCGTGGCGCGGCGGGGCGGGGAAAGGGGCGGGGCCTGGGCGGCGGAAGTGCGCAGCCGCGCGGCATTCTGGGGCCGGAAGTGGGGTGCACGCTTCGGGTTGGTGTCATGGCAGCTGCGGGGAGCCGCAAGAGGTAAGCCGCGGGTC'

In [11]:
locs_df = pd.DataFrame.from_dict(locs, orient='index', columns=['region','start','end','strand']).rename_axis('id')
positives_df = locs_df[locs_df.index.str.contains("positive")]
negatives_df = locs_df[locs_df.index.str.contains("negative")]
positives_df.shape, negatives_df.shape

((13895, 4), (13896, 4))

In [12]:
positives_df['region'] = "chr" + positives_df['region']
negatives_df['region'] = "chr" + negatives_df['region']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positives_df['region'] = "chr" + positives_df['region']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  negatives_df['region'] = "chr" + negatives_df['region']


## Train/test split

In [13]:
train_positives, test_positives = train_test_split(positives_df, shuffle=True, random_state=42)
train_positives.shape, test_positives.shape

((10421, 4), (3474, 4))

In [14]:
train_negatives, test_negatives = train_test_split(negatives_df, shuffle=True, random_state=42)
train_negatives.shape, test_negatives.shape

((10422, 4), (3474, 4))

## YAML file

In [15]:
BASE_FILE_PATH = Path("../../datasets/human_enhancers_cohn/")

# copied from https://stackoverflow.com/a/57892171
def rm_tree(pth: Path):
    for child in pth.iterdir():
        if child.is_file():
            child.unlink()
        else:
            rm_tree(child)
    pth.rmdir()

if BASE_FILE_PATH.exists():
    rm_tree(BASE_FILE_PATH)
    
BASE_FILE_PATH.mkdir()
(BASE_FILE_PATH / 'train').mkdir()
(BASE_FILE_PATH / 'test').mkdir()

In [16]:
with open(BASE_FILE_PATH / 'metadata.yaml', 'w') as fw:
    desc = {
        'version': 0,
        'classes': {
            'positive': {
                'type': 'fa.gz',
                'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
                'extra_processing': 'ENSEMBL_HUMAN_GENOME' 
            },    
            'negative': {
                'type': 'fa.gz',
                'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
                'extra_processing': 'ENSEMBL_HUMAN_GENOME' 
            }
        }
    }
    
    yaml.dump(desc, fw)

desc

{'version': 0,
 'classes': {'positive': {'type': 'fa.gz',
   'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
   'extra_processing': 'ENSEMBL_HUMAN_GENOME'},
  'negative': {'type': 'fa.gz',
   'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
   'extra_processing': 'ENSEMBL_HUMAN_GENOME'}}}

## CSV files

In [17]:
train_positives.to_csv(BASE_FILE_PATH / 'train' / 'positive.csv.gz', index=False, compression='gzip')
train_negatives.to_csv(BASE_FILE_PATH / 'train' / 'negative.csv.gz', index=False, compression='gzip')
test_positives.to_csv(BASE_FILE_PATH / 'test' / 'positive.csv.gz', index=False, compression='gzip')
test_negatives.to_csv(BASE_FILE_PATH / 'test' / 'negative.csv.gz', index=False, compression='gzip')

In [17]:
# hotfix - adding ids

BASE_FILE_PATH = Path("../../datasets/human_enhancers_cohn/")

def add_ids(df, prefix):
    ids = prefix + df.index.map(str)
    df.insert(loc=0, column='id', value=ids)

train_positives = pd.read_csv(BASE_FILE_PATH / 'train' / 'positive.csv.gz')
add_ids(train_positives, "train_positive_")
train_positives.to_csv(BASE_FILE_PATH / 'train' / 'positive.csv.gz', index=False, compression='gzip')

train_negatives = pd.read_csv(BASE_FILE_PATH / 'train' / 'negative.csv.gz')
add_ids(train_negatives, "train_negative_")
train_negatives.to_csv(BASE_FILE_PATH / 'train' / 'negative.csv.gz', index=False, compression='gzip')

test_positives = pd.read_csv(BASE_FILE_PATH / 'test' / 'positive.csv.gz')
add_ids(test_positives, "test_positive_")
test_positives.to_csv(BASE_FILE_PATH / 'test' / 'positive.csv.gz', index=False, compression='gzip')

test_negatives = pd.read_csv(BASE_FILE_PATH / 'test' / 'negative.csv.gz')
add_ids(test_negatives, "test_negative_")
test_negatives.to_csv(BASE_FILE_PATH / 'test' / 'negative.csv.gz', index=False, compression='gzip')

## Cleaning

In [18]:
!rm all_together.fa Enhancers_vs_negative.tgz
!rm -rf Human

## Testing

In [18]:
from genomic_benchmarks.loc2seq import download_dataset

download_dataset("human_enhancers_cohn")

Reference /home/petr/.genomic_benchmarks/fasta/Homo_sapiens.GRCh38.dna.toplevel.fa.gz already exists. Skipping.




  0%|          | 0/24 [00:00<?, ?it/s]

PosixPath('/home/petr/.genomic_benchmarks/human_enhancers_cohn')

In [19]:
from genomic_benchmarks.data_check import info

info("human_enhancers_cohn", 0)

Dataset `human_enhancers_cohn` has 2 classes: negative, positive.

All lenghts of genomic intervals equals 500.

Totally 27791 sequences have been found, 20843 for training and 6948 for testing.


Unnamed: 0,train,test
negative,10422,3474
positive,10421,3474
