In [1]:
import pandas as pd
import yaml
from pathlib import Path
from sklearn.model_selection import train_test_split

## Download data from the ECCB2020 workshop

In [2]:
# Load the 50k intergenomic sequence sample from the ECCB2020 workshop repository.
# The file is fetched over HTTPS (not plain HTTP) so the download cannot be
# tampered with in transit and does not depend on an HTTP->HTTPS redirect.
intergenomic_seqs_raw = pd.read_csv("https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/ECCB2020/master/data/intergenomic_seqs_50k.csv")
# Print the (rows, columns) shape, then preview the first rows as the cell output.
print(intergenomic_seqs_raw.shape)
intergenomic_seqs_raw.head()

(50000, 4)


Unnamed: 0,chr,start,end,seq
0,17,14239397,14239596,AACTGGGATTCACAGGAGCTTAATGGAGCACATGATGTTAAGTGAA...
1,X,90928374,90928573,GCTAGTTGTATGGTTAGCAGCAAGATATTTTTTCTCTCTGATCTTT...
2,3,104278717,104278916,GACTTTGTAGACTTGTGTGACCTGTGTGCCTCCCTCTCCCCCCAAA...
3,4,187089054,187089253,ATGTTAACACCAAATCAGTCCATCCTAATTATCACTCAAAAATCAA...
4,2,137742849,137743048,GCAGGAGCTCTATCTGTTTGGACTAGTTCAGCCCCATCTCTTTTGG...


In [3]:
# Load the random transcript windows published by the ECCB2020 workshop.
coding_seqs_raw = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/ML-Bioinfo-CEITEC/ECCB2020/master/data/random_transcripts.csv",
)
# Report the (rows, columns) shape and show the first few records.
print(coding_seqs_raw.shape)
coding_seqs_raw.head()

(50000, 5)


Unnamed: 0,id,chr,random_start,random_end,seq
0,ENST00000443659,3,294,493,CTTCTGAAGAAAGAAATCGATAAGTTCAAAGACAAAAAAGAGGTAG...
1,ENST00000458477,12,647,846,CTGCACCCACTGCCAGGAGGAGATCGGATCCCGGAACTTCTTCGAG...
2,ENST00000610355,4,1042,1241,GTAGTCAATGTGAACCAATTACATTGGAACTCTGCATGAATTTGCC...
3,ENST00000536187,12,1055,1254,GGAGGTTTACTGGCGGGCCACGCACCACCCAGCCCCTGGCCCCGGA...
4,ENST00000561718,16,109,308,ATGAAGCCGAGAAGGCGCTGAAGCACATGGATGGAGGACAAATTGA...


## Data transform

In [4]:
# Reshape the raw intergenomic table into the (id, region, start, end, strand) layout:
#   id     - running row number
#   region - chromosome name with a 'chr' prefix
#   start  - raw start shifted down by one; end is taken over unchanged
#            (presumably 1-based inclusive -> 0-based half-open; TODO confirm)
#   strand - constant '+'
intergenomic_seqs = pd.DataFrame(
    {
        'id': list(range(len(intergenomic_seqs_raw))),
        'region': intergenomic_seqs_raw['chr'].astype(str).radd('chr'),
        'start': intergenomic_seqs_raw['start'].sub(1),
        'end': intergenomic_seqs_raw['end'],
        'strand': '+',
    }
)
intergenomic_seqs

Unnamed: 0,id,region,start,end,strand
0,0,chr17,14239396,14239596,+
1,1,chrX,90928373,90928573,+
2,2,chr3,104278716,104278916,+
3,3,chr4,187089053,187089253,+
4,4,chr2,137742848,137743048,+
...,...,...,...,...,...
49995,49995,chr4,44124986,44125186,+
49996,49996,chr1,233442677,233442877,+
49997,49997,chrX,30807631,30807831,+
49998,49998,chr20,53905546,53905746,+


In [5]:
# Reshape the raw transcript table into the (id, region, start, end, strand) layout:
#   id     - running row number
#   region - transcript identifier (the raw 'id' column)
#   start  - raw random_start unchanged; end is random_end pushed up by one
#            (presumably inclusive end -> exclusive end; TODO confirm)
#   strand - constant '+'
coding_seqs = pd.DataFrame(
    {
        'id': list(range(len(coding_seqs_raw))),
        'region': coding_seqs_raw['id'].astype(str),
        'start': coding_seqs_raw['random_start'],
        'end': coding_seqs_raw['random_end'].add(1),
        'strand': '+',
    }
)
coding_seqs

Unnamed: 0,id,region,start,end,strand
0,0,ENST00000443659,294,494,+
1,1,ENST00000458477,647,847,+
2,2,ENST00000610355,1042,1242,+
3,3,ENST00000536187,1055,1255,+
4,4,ENST00000561718,109,309,+
...,...,...,...,...,...
49995,49995,ENST00000392833,939,1139,+
49996,49996,ENST00000342970,182,382,+
49997,49997,ENST00000575314,613,813,+
49998,49998,ENST00000557658,863,1063,+


## Train/test split

In [6]:
# Shuffled, seeded split of the coding sequences.
# test_size=0.25 spells out train_test_split's default hold-out fraction.
train_coding_seqs, test_coding_seqs = train_test_split(
    coding_seqs,
    test_size=0.25,
    random_state=42,
    shuffle=True,
)
(train_coding_seqs.shape, test_coding_seqs.shape)

((37500, 5), (12500, 5))

In [7]:
# Shuffled, seeded split of the intergenomic sequences.
# test_size=0.25 spells out train_test_split's default hold-out fraction.
train_intergenomic_seqs, test_intergenomic_seqs = train_test_split(
    intergenomic_seqs,
    test_size=0.25,
    random_state=42,
    shuffle=True,
)
(train_intergenomic_seqs.shape, test_intergenomic_seqs.shape)

((37500, 5), (12500, 5))

## YAML metadata

In [8]:
# Output directory of the generated dataset (metadata.yaml plus the train/ and
# test/ CSV files). The path is relative, so it resolves against the current
# working directory of the kernel.
BASE_FILE_PATH = Path("../../datasets/demo_coding_vs_intergenomic_seqs/")

# adapted from https://stackoverflow.com/a/57892171
def rm_tree(pth: Path):
    """Recursively delete the directory ``pth`` and everything inside it.

    Symbolic links found inside the tree are removed as links and never
    followed, so whatever they point to is left untouched.

    Args:
        pth: Directory to remove; it must exist and be a directory.

    Raises:
        OSError: If ``pth`` cannot be listed or one of its entries cannot be
            removed.
    """
    for child in pth.iterdir():
        # is_dir() follows symlinks, so rule links out explicitly: recursing
        # into a linked directory would wipe data that lives outside the tree.
        if child.is_dir() and not child.is_symlink():
            rm_tree(child)
        else:
            # Regular files, symlinks (including dangling ones) and any other
            # non-directory entry are removed with a plain unlink.
            child.unlink()
    pth.rmdir()

# Start from a clean slate: drop any previously generated dataset directory.
if BASE_FILE_PATH.exists():
    rm_tree(BASE_FILE_PATH)

# parents=True creates missing ancestor directories too, so the cell also works
# on a fresh checkout where the parent 'datasets' folder does not exist yet.
BASE_FILE_PATH.mkdir(parents=True)
(BASE_FILE_PATH / 'train').mkdir()
(BASE_FILE_PATH / 'test').mkdir()

In [9]:
# Dataset description: a format version plus, for every class, the type and URL
# of a gzipped FASTA file and the name of an extra processing step.
# NOTE(review): presumably these are the references that the 'region'
# coordinates in the CSV files are resolved against - confirm with the loader.
desc = {
    'version': 0,
    'classes': {
        'intergenomic_seqs': {
            'type': 'fa.gz',
            'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
            'extra_processing': 'ENSEMBL_HUMAN_GENOME',
        },
        'coding_seqs': {
            'type': 'fa.gz',
            'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz',
            'extra_processing': 'ENSEMBL_HUMAN_TRANSCRIPTOME',
        },
    },
}

# Serialize the description next to the train/ and test/ folders.
with (BASE_FILE_PATH / 'metadata.yaml').open('w') as fw:
    yaml.dump(desc, fw)

# Echo the description as the cell output.
desc

{'version': 0,
 'classes': {'intergenomic_seqs': {'type': 'fa.gz',
   'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz',
   'extra_processing': 'ENSEMBL_HUMAN_GENOME'},
  'coding_seqs': {'type': 'fa.gz',
   'url': 'http://ftp.ensembl.org/pub/release-97/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz',
   'extra_processing': 'ENSEMBL_HUMAN_TRANSCRIPTOME'}}}

## CSV files

In [10]:
# Write every split/class combination as a gzip-compressed CSV without the
# DataFrame index; the order matches train first, then test.
for frame, destination in (
    (train_intergenomic_seqs, BASE_FILE_PATH / 'train' / 'intergenomic_seqs.csv.gz'),
    (train_coding_seqs, BASE_FILE_PATH / 'train' / 'coding_seqs.csv.gz'),
    (test_intergenomic_seqs, BASE_FILE_PATH / 'test' / 'intergenomic_seqs.csv.gz'),
    (test_coding_seqs, BASE_FILE_PATH / 'test' / 'coding_seqs.csv.gz'),
):
    frame.to_csv(destination, index=False, compression='gzip')