## Baseline with different train negative sets

In [2]:
import os
from pathlib import Path
import pandas as pd
from models.baseline import baseline, baseline_sklearn
from utils.utils import get_gc_count_from_ohe

In [2]:
# Input locations and result files for the "different train negatives" experiment.
# Everything is expressed relative to one shared data directory.
data_dir = Path("..") / "Data"
outputs_dir = data_dir / "outputs"

rbp24_path = data_dir / "rbp24" / "processed"
rbp31_path = data_dir / "rbp31"
out_rbp24 = outputs_dir / "rbp24_baseline_results.tsv"
out_rbp31 = outputs_dir / "rbp31_baseline_results.tsv"

In [5]:
# For every protein directory under `rbp24_path`: fit the sklearn baseline once
# per training file found in its "train" folder and evaluate each fit on the
# protein's "test/original.tsv.gz". One result row is collected per
# (protein, training file) pair and the table is written to `out_rbp24`.
results = []
for root, dirs, files in os.walk(rbp24_path):
    # os.walk yields sub-directories in arbitrary (filesystem) order. Sorting
    # `dirs` in place makes the traversal deterministic and guarantees that a
    # protein's "test" folder is visited before its "train" folder, so the
    # `test_df` used below belongs to the protein currently being trained.
    dirs.sort()
    root_path = Path(root)

    # A directory with exactly two sub-directories is treated as a protein
    # folder; its own name is the protein name.
    if len(dirs) == 2:
        protein_name = root_path.name  # separator-independent, unlike split("/")

    if root.endswith("test"):
        test_df = pd.read_csv(root_path / "original.tsv.gz", delimiter="\t", index_col=0, compression="gzip")

    if root.endswith("train"):
        # Key each training frame by the file name up to its first dot
        # (e.g. "original.tsv.gz" -> "original").
        train_dfs = {}
        for file in files:
            if file.endswith("tsv.gz"):
                train_dfs[file.split(".")[0]] = pd.read_csv(root_path / file, delimiter="\t", index_col=0, compression="gzip")

        # GC statistics are always taken from the original training set and
        # repeated on every row of this protein.
        posGC, negGC = get_gc_count_from_ohe(train_dfs['original'])
        for data_type, train_df in train_dfs.items():
            test_acc, test_auc, _ = baseline_sklearn(train_df, test_df)
            results.append([protein_name, posGC, negGC, test_acc.item(), test_auc.item(), data_type])

results_df = pd.DataFrame(results, columns=["Protein", "PosGC", "NegGC", "TestACC", "TestAUC", "DataType"])
# Make sure the output folder exists so the finished run is not lost on write.
out_rbp24.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(out_rbp24, sep='\t')

### Baseline with different test sets

In [3]:
# Input locations and result files for the "different test sets" experiment.
# Note that `out_rbp24` / `out_rbp31` are re-pointed to new files here.
data_dir = Path("..") / "Data"
outputs_dir = data_dir / "outputs"

rbp24_path = data_dir / "rbp24" / "processed"
rbp31_path = data_dir / "rbp31"
out_rbp24 = outputs_dir / "rbp24_baseline_evals_new.tsv"
out_rbp31 = outputs_dir / "rbp31_baseline_evals.tsv"

In [6]:
# For every protein directory under `rbp24_path`: fit the sklearn baseline on
# "train/original.tsv.gz" and evaluate it on two test files of the same protein
# ("original" and "distributed"). One result row is collected per
# (protein, test file) pair and the table is written to `out_rbp24`.
results = []
for root, dirs, files in os.walk(rbp24_path):
    # os.walk yields sub-directories in arbitrary (filesystem) order. Sorting
    # `dirs` in place makes the traversal deterministic and guarantees that a
    # protein's "test" folder is visited before its "train" folder, so the
    # `test_dfs` used below belong to the protein currently being trained.
    dirs.sort()
    root_path = Path(root)

    # A directory with exactly two sub-directories is treated as a protein
    # folder; its own name is the protein name.
    if len(dirs) == 2:
        protein_name = root_path.name  # separator-independent, unlike split("/")

    if root.endswith("test"):
        test_dfs = {}
        # Only two fixed test files are evaluated. They are read once when the
        # folder holds any *.tsv.gz file, instead of once per file found.
        if any(file.endswith("tsv.gz") for file in files):
            test_dfs["original"] = pd.read_csv(root_path / "original.tsv.gz", delimiter="\t", index_col=0, compression="gzip")
            test_dfs["distributed"] = pd.read_csv(root_path / "dist_neg_orig_pos.tsv.gz", delimiter="\t", index_col=0, compression="gzip")

    if root.endswith("train"):
        train_df = pd.read_csv(root_path / "original.tsv.gz", delimiter="\t", index_col=0, compression="gzip")

        train_posGC, train_negGC = get_gc_count_from_ohe(train_df)

        for data_type, test_df in test_dfs.items():
            test_posGC, test_negGC = get_gc_count_from_ohe(test_df)
            test_acc, test_auc, _ = baseline_sklearn(train_df, test_df)
            results.append([protein_name, train_posGC, train_negGC, test_posGC, test_negGC, test_acc.item(), test_auc.item(), data_type])

results_df = pd.DataFrame(results, columns=["Protein", "Train_PosGC", "Train_NegGC", "Test_PosGC", "Test_NegGC", "TestACC", "TestAUC", "DataType"])
# Make sure the output folder exists so the finished run is not lost on write.
out_rbp24.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(out_rbp24, sep='\t')

### Let's see the whole workflow for a single RBP protein

In [9]:
# Index the rbp24 dataset. The displayed result maps directory paths to lists of TSV file names.
# NOTE(review): `process_dataset`, `rbp_dir_path` and `prism_dir_path` are not defined or
# imported in the cells shown here — confirm they are set up earlier so that this cell
# survives Restart Kernel -> Run All.
dir_dict, entry_count = process_dataset(rbp_dir_path=rbp_dir_path, prism_dir_path=prism_dir_path, dataset='rbp24')
# NOTE(review): `content` and `count` are not used by any cell shown here — confirm they are still needed.
content = []
count = 1

# Show the directory -> file-list mapping.
dir_dict

{'../Data/rbp24/processed/positives_train': ['ALKBH5_Baltz2012.tsv',
  'C17ORF85_Baltz2012.tsv',
  'C22ORF28_Baltz2012.tsv',
  'CAPRIN1_Baltz2012.tsv',
  'CLIPSEQ_AGO2.tsv',
  'CLIPSEQ_ELAVL1.tsv',
  'CLIPSEQ_SFRS1.tsv',
  'ICLIP_HNRNPC.tsv',
  'ICLIP_TDP43.tsv',
  'ICLIP_TIA1.tsv',
  'ICLIP_TIAL1.tsv',
  'PARCLIP_AGO1234.tsv',
  'PARCLIP_ELAVL1.tsv',
  'PARCLIP_ELAVL1A.tsv',
  'PARCLIP_EWSR1.tsv',
  'PARCLIP_FUS.tsv',
  'PARCLIP_HUR.tsv',
  'PARCLIP_IGF2BP123.tsv',
  'PARCLIP_MOV10_Sievers.tsv',
  'PARCLIP_PUM2.tsv',
  'PARCLIP_QKI.tsv',
  'PARCLIP_TAF15.tsv',
  'ZC3H7B_Baltz2012.tsv'],
 '../Data/rbp24/processed/negatives_ls': ['ALKBH5_Baltz2012.tsv',
  'C17ORF85_Baltz2012.tsv',
  'C22ORF28_Baltz2012.tsv',
  'CAPRIN1_Baltz2012.tsv',
  'CLIPSEQ_AGO2.tsv',
  'CLIPSEQ_ELAVL1.tsv',
  'CLIPSEQ_SFRS1.tsv',
  'ICLIP_HNRNPC.tsv',
  'ICLIP_TDP43.tsv',
  'ICLIP_TIA1.tsv',
  'ICLIP_TIAL1.tsv',
  'PARCLIP_AGO1234.tsv',
  'PARCLIP_ELAVL1.tsv',
  'PARCLIP_ELAVL1A.tsv',
  'PARCLIP_EWSR1.tsv',
  'PAR

In [22]:
# Choose the protein to inspect by its position in the negatives_ls file listing;
# `i` is kept as a notebook-level variable for later cells.
i = 20
negatives_ls_files = dir_dict['../Data/rbp24/processed/negatives_ls']
protein_name = negatives_ls_files[i]
print(f"Protein name: {protein_name}")

Protein name: PARCLIP_QKI.tsv


In [23]:
# Load the data for the protein at index `i` and unpack the four results.
# NOTE(review): judging by the names, these are presumably the positive/negative training
# frames and the positive/negative "ls" frames — confirm against `csv_to_df`.
dfs = csv_to_df(dir_dict, i)
pos_train, neg_train, pos_ls, neg_ls = dfs
# Preview the first rows of the training positives.
pos_train.head()

Unnamed: 0,name,seq,label
0,chr1:1102484-1102612(),CAGCTCGGGCAGCCGTGGCCATCTTACTGGGCAGCATTGGATGGAG...,1
1,chr1:1103242-1103370(),CCGGGCCCCTGTGAGCATCTTACCGGACAGTGCTGGATTTCCCAGC...,1
2,chr1:1104381-1104509(),CACCGCCGGCCGATGGGCGTCTTACCAGACATGGTTAGACCTGGCC...,1
3,chr1:2241351-2241479(),CTTTAAGTCAGGAGTCACAAATGACTTTTTTTTTTCAATTAAGGAA...,1
4,chr1:3762198-3762326(),TATAACAAACTCTGACCTACACTGTTatcaaatgggatgatgtata...,1


In [24]:
# Preview the first rows of the training negatives.
neg_train.head()

Unnamed: 0,name,seq,label
0,chr1:3661249-3661377(),CTAGTAAGAGCATGTCCAAACCTGGACACACCAAAGGTGAGCCAGG...,0
1,chr1:3745688-3745816(),AAACTAACCTAAAAAACCAAAAGGAATCAAGACGCTCATTTGCATA...,0
2,chr1:6650934-6651062(),GGGCGTCCTTTATTACATACGCGTCTCTGAAGTCATATAAATATAG...,0
3,chr1:6659475-6659603(),CACACGCAGTCATAGAGCCGGGAGCCATCGGACCCACCTGCCAGGA...,0
4,chr1:6747052-6747180(),GTCCATATGTTCCCCAATAATCTGATGGAAAGAAACTGCATCAGCT...,0


In [25]:
# Build the dataset variants from the loaded frames; the recorded output lists the
# resulting keys as 'original', 'shuffled' and 'sameGC'.
datasets = augment_negatives(dfs)
# Show which variants were produced.
datasets.keys()

    Augmenting negatives...


dict_keys(['original', 'shuffled', 'sameGC'])

#### Explore given datasets with new negatives

In [26]:
# Sanity check: the test part (element [1]) should hold the same sequences in every
# dataset variant, so the row at position 2 must print identically three times.
for variant in ('original', 'shuffled', 'sameGC'):
    test_part = datasets[variant][1]
    print(test_part.iloc[2].seq.upper())

TGGTTTATTTAACTCAGGATCTGCCATTTGTCTTCTGTGTTTTGCACTCACCTTTTTAACTTCCTAACACCACATTTATAATTATAGATTTTTCTTTCTATGTCCTTCCTTATGTCTGCACCTTCTGC
TGGTTTATTTAACTCAGGATCTGCCATTTGTCTTCTGTGTTTTGCACTCACCTTTTTAACTTCCTAACACCACATTTATAATTATAGATTTTTCTTTCTATGTCCTTCCTTATGTCTGCACCTTCTGC
TGGTTTATTTAACTCAGGATCTGCCATTTGTCTTCTGTGTTTTGCACTCACCTTTTTAACTTCCTAACACCACATTTATAATTATAGATTTTTCTTTCTATGTCCTTCCTTATGTCTGCACCTTCTGC


In [35]:
# Sanity check on the training part (element [0]): the positive with this name should
# carry the same sequence in 'original' and 'sameGC', and a different one in 'shuffled'.
positive_name = 'chr1:1102484-1102612()'
for variant in ('original', 'sameGC', 'shuffled'):
    train_part = datasets[variant][0]
    print(train_part[train_part.name == positive_name].seq)

4915    CAGCTCGGGCAGCCGTGGCCATCTTACTGGGCAGCATTGGATGGAG...
Name: seq, dtype: object
10009    CAGCTCGGGCAGCCGTGGCCATCTTACTGGGCAGCATTGGATGGAG...
Name: seq, dtype: object
148    TGGGGTCGGCGTCATTGACCTGGGCAGCCCCCCCGAGCCCGGGCGA...
Name: seq, dtype: object


In [28]:
# Sanity check on the training part (element [0]): the negative with this name should
# carry a different sequence in each of the three dataset variants.
negative_name = 'chr1:3745688-3745816()'
for variant in ('original', 'sameGC', 'shuffled'):
    train_part = datasets[variant][0]
    print(train_part[train_part.name == negative_name].seq)

4263    AAACTAACCTAAAAAACCAAAAGGAATCAAGACGCTCATTTGCATA...
Name: seq, dtype: object
9724    GCACCACCGTCTGGTAAGAGCCTCGGGCATCCTGTTGTCGAAGGCA...
Name: seq, dtype: object
12226    ATAACGATTCTATTGAAAGACACAGAGTTTAAAGAGTATCTTGGTA...
Name: seq, dtype: object
