In [1]:
import sys
# Put the parent directory first on the module search path — presumably so the
# local `data_processing` package imported in later cells resolves
# (NOTE(review): confirm against the repository layout).
sys.path.insert(0, "..")

### RBP24 dataset preprocessing

This dataset originates from the GraphProt paper (https://genomebiology.biomedcentral.com/articles/10.1186/gb-2014-15-1-r17?optIn=true). It consists of 24 proteins. For each protein, four files are present: train.positives, train.negatives, ls.positives, ls.negatives. This notebook processes the dataset in a dynamic manner.

- Distinct folders are created for each split (e.g. negatives_ls)
- Headers of files are extracted, chromosome coordinates are adjusted for uniform length -> results in .bed files
- .bed files are mapped to the reference genome and saved as .tsv files in the corresponding folders
- Each .tsv file consists of tab-delimited columns: chromosome + coordinates and sequence
- Secondary structure notation and conservation scores can be added to the .tsv files


In [1]:
from pathlib import Path
import pandas as pd
from data_processing.process_rbp24 import *

In [2]:
# Input/output locations and window size used by the RBP24 preprocessing call.
raw_data_path = Path('../Data/rbp24/raw') # folder containing the raw RBP24 FASTA files
# NOTE(review): the processing log lists inputs ending in ".fa.gz" — confirm this ".fa" path exists.
raw_file_path = Path('../Data/rbp24/raw/ALKBH5_Baltz2012.ls.negatives.fa') # a single raw RBP24 FASTA file (alternative to passing the whole folder)
processed_data_path = Path('../Data/rbp24/processed') # output folder for the processed files
# NOTE(review): "referecnce_genome" looks like a misspelling of "reference_genome" — confirm the on-disk directory name before changing this path.
ref_fasta_path = '../Data/reference/referecnce_genome/hg19_latest.fa' # reference genome FASTA (a plain str, unlike the Path objects above)
conservation_arrays_path = Path('../Data/reference/conservation/arrays') # folder with the conservation-score NumPy arrays (these have to be created first)
window_size = 128 # sequence length, i.e. the coordinate window of the .bed files

In [3]:
# Heads-up: computing secondary structure and mapping conservation scores can both take a while.
# Conservation scores can only be mapped once the conservation arrays exist;
# create them first by running ../src/data_processing/process_wigfix.py
# raw_data_path may point to a folder, in which case the function iterates over
# all files in it and processes the .fa files, or to a single file, in which
# case only that file is processed.

# Collect the run options in one place, then hand them to the preprocessing entry point.
rbp24_options = {
    "raw_data_path": raw_data_path,
    "processed_data_path": processed_data_path,
    "ref_fasta_path": ref_fasta_path,
    "conservation_arrays_path": conservation_arrays_path,
    "window_size": window_size,
    "compute_secondary_structure": False,
    "map_conservation_scores": False,
}
rbp24_preprocessing(**rbp24_options)

Number of files to process: 92
...
Processing file: ALKBH5_Baltz2012.ls.negatives.fa.gz
    Creating bed file...
    Mapping to reference fasta...
    Saving processed protein to: ../Data/rbp24/processed/negatives_ls/ALKBH5_Baltz2012.tsv
File number 1 processed
...
Processing file: ALKBH5_Baltz2012.ls.positives.fa.gz
    Creating bed file...
    Mapping to reference fasta...
    Saving processed protein to: ../Data/rbp24/processed/positives_ls/ALKBH5_Baltz2012.tsv
File number 2 processed
...
Processing file: ALKBH5_Baltz2012.train.negatives.fa.gz
    Creating bed file...
    Mapping to reference fasta...
    Saving processed protein to: ../Data/rbp24/processed/negatives_train/ALKBH5_Baltz2012.tsv
File number 3 processed
...
Processing file: ALKBH5_Baltz2012.train.positives.fa.gz
    Creating bed file...
    Mapping to reference fasta...
    Saving processed protein to: ../Data/rbp24/processed/positives_train/ALKBH5_Baltz2012.tsv
File number 4 processed
...
Processing file: C17ORF85_Bal

In [4]:
# Column names of the processed .tsv; extend this list if secondary structure
# and/or conservation scores were added to the data during preprocessing.
names = ["chromosome", "sequence", "label"]

# Build the location from processed_data_path (set in the configuration cell) so this
# check follows the configured output folder instead of repeating it as a literal.
rbp_tsv_path = processed_data_path / "negatives_ls" / "ALKBH5_Baltz2012.tsv"

rbp_protein = (
    pd.read_csv(rbp_tsv_path, sep="\t", header=None, names=names)
    # Vectorised upper-casing via the .str accessor; returns a new frame rather
    # than mutating a column in place, so re-running the cell is idempotent.
    .assign(sequence=lambda frame: frame["sequence"].str.upper())
)

# .iloc[0] is positional, so it does not depend on the index containing the label 0.
first_sequence = rbp_protein["sequence"].iloc[0]
print(f"Sequence length: {len(first_sequence)}")
rbp_protein.head()

Sequence length: 128


Unnamed: 0,chromosome,sequence,label
0,chr1:25553862-25553990(),TTTATGTACATGTATACACACACACACACACAAATACATCCACATC...,0
1,chr1:36359687-36359815(),GAGGTGCTGGACATCAGGAACATAGATGAGCAGCCCAAGCCCCTCA...,0
2,chr1:36699097-36699225(),GAGTGAGACTCCATCTCAAAAAAAAAAAAAAGTTGATTTAAGGCTG...,0
3,chr1:38385555-38385683(),GTGAGCCAAGATCATGCCACTGCACTCCAGCCTGGGCAACACAGCA...,0
4,chr1:55650559-55650687(),AAATATAATATAAAACTTTTGGGTACAGCAAAAGCAGTGATTAAAG...,0


### PRISMNET dataset preprocessing

This dataset originates from the PRISMNET paper (https://www.nature.com/articles/s41422-021-00476-y). It consists of 173 processed proteins. Each protein is stored as a .tsv file. Protein files are split into train/ls and positives/negatives and stored in folders as in the RBP24 processing. Here the raw sequences are used, since preprocessing was performed when the dataset was created (see the link above). Conservation score mapping is not possible since I don't have information about the reference genome version.

In [6]:
from data_processing.process_prism import *
from data_processing.utils import *

In [7]:
# Input and output folders for the PRISMNET preprocessing call.
# NOTE(review): the folder is spelled "prisment" while the dataset is called PRISMNET —
# confirm the on-disk directory name before changing these paths.
in_dir_path = Path("../Data/prisment/raw_tsv")  # folder with the raw per-protein .tsv files
out_dir_path = Path("../Data/prisment/processed")  # folder that receives the processed files

In [8]:
# Collect the run options in one place, then hand them to the preprocessing entry point.
prism_options = {
    "in_dir_path": in_dir_path,
    "out_dir_path": out_dir_path,
    "compute_secondary_structure": False,
}
prism_preprocessing(**prism_options)

Number of files to process: 173
Processing file: AARS_K562.tsv
File number 1 processed
...
Processing file: AATF_K562.tsv
File number 2 processed
...
Processing file: ABCF1_K562.tsv
File number 3 processed
...
Processing file: AGGF1_HepG2.tsv
File number 4 processed
...
Processing file: AGO_HEK293.tsv
File number 5 processed
...
Processing file: AKAP1_K562.tsv
File number 6 processed
...
Processing file: AKAP8L_K562.tsv
File number 7 processed
...
Processing file: ALKBH5_HEK293.tsv
File number 8 processed
...
Processing file: APOBEC3C_K562.tsv
File number 9 processed
...
Processing file: AQR_HepG2.tsv
File number 10 processed
...
Processing file: ATXN2_HEK293T.tsv
File number 11 processed
...
Processing file: AUH_K562.tsv
File number 12 processed
...
Processing file: BCCIP_HepG2.tsv
File number 13 processed
...
Processing file: BCLAF1_HepG2.tsv
File number 14 processed
...
Processing file: BUD13_HepG2.tsv
File number 15 processed
...
Processing file: C17ORF85_HEK293.tsv
File number 16 

In [9]:
# Column names of the processed .tsv; extend this list if secondary structure
# and/or conservation scores were added to the data during preprocessing.
names = ["chromosome", "sequence", "label"]

# Build the location from out_dir_path (set in the PRISMNET configuration cell) so this
# check follows the configured output folder instead of repeating it as a literal.
prism_tsv_path = out_dir_path / "negatives_ls" / "AARS_K562.tsv"

prism_protein = (
    pd.read_csv(prism_tsv_path, sep="\t", header=None, names=names)
    # Vectorised upper-casing via the .str accessor; returns a new frame rather
    # than mutating a column in place, so re-running the cell is idempotent.
    .assign(sequence=lambda frame: frame["sequence"].str.upper())
)

# .iloc[0] is positional, so it does not depend on the index containing the label 0.
first_sequence = prism_protein["sequence"].iloc[0]
print(f"Sequence length: {len(first_sequence)}")
prism_protein.head()

Sequence length: 101


Unnamed: 0,chromosome,sequence,label
0,ENST00000406567|1221|1321,ATTATTGTGACCATCCTGCTGCTCCAGAGTGCCTTTCCAGGTTTCC...,0
1,ENST00000415131|489|589,TCAGAATATTTTGTAATGAAAGGATCTAGAAAGCAACTTGGAAGTG...,0
2,ENST00000373237|1936|2036,TAGATTTATTTATTTATTTAGAGACAGGGTCTCACTCTAGCCCAGG...,0
3,ENST00000409913|1687|1787,TCTGAAAACTAGGCGGAGGTTTCCAAATCTCAATTCTTGACTTCTG...,0
4,ENST00000459956|489|589,CCAAGACCAAGAAGAAGAAGGCTGAGGGCACCGTGTTCACCGAGGA...,0
