In [1]:
# Make the project's `src` directory importable. The path is relative, so it
# is resolved against the kernel's current working directory.
import sys
sys.path.append('./src')

In [2]:
import pandas as pd

In [3]:
import numpy as np

In [None]:
from data.dataloaders import LitCoverageDatasetHDF5
from util.load_config import config

### Loading Metadata
For every output class, I have calculated different quality metrics (`*_QC.tsv`) and extracted relevant metadata (`*_metadata_processed.tsv`).

In [7]:
# Quality metrics (`*_QC.tsv`) and processed ENCODE metadata
# (`*_metadata_processed.tsv`); both files are tab-separated.
qc_tsv = '/fast/AG_Ohler/remo/nucleotran/220407_hg38_roi_mincov2_nc10_QC.tsv'
metadata_tsv = '../../data/processed/221111_encode_metadata_processed.tsv'

md_qc = pd.read_csv(qc_tsv, sep='\t')
md = pd.read_csv(metadata_tsv, sep='\t')

In [8]:
# Normalise column names in BOTH tables by replacing '.' with '_'.
# Each mapping is built from the table's own columns, so dotted names in
# `md_qc` are renamed as well (a mapping built from `md.columns` would only
# contain md's names and leave md_qc's dotted columns untouched).
md.rename(columns={c: c.replace('.', '_') for c in md.columns}, inplace=True)
md_qc.rename(columns={c: c.replace('.', '_') for c in md_qc.columns}, inplace=True)
# Give the column that was read in as 'Unnamed: 0' the same name the
# accession column has in `md`.
md_qc.rename(columns={'Unnamed: 0': 'File_accession'}, inplace=True)
# 221111_encode_metadata_processed.tsv contains information about the different classes
# later I would like to incorporate this metadata into the learning process
md.head()

Unnamed: 0,File_accession,Biosample_term_name,Biosample_organ_slims,Biosample_system_slims,Biosample_developmental_slims,Biosample_organism,proc_Assay_lvl1,proc_target,Experiment_date_released,Library_lab,spot2_score,five_percent_narrowpeaks_count,frip,reproducible_peaks,proc_age_bin,proc_age_bin_units,proc_Biosample_life_stage
0,ENCFF007FAW,gastroesophageal sphincter,"musculature of body,stomach","digestive system,musculature",endoderm,Homo sapiens,ATAC-seq,Accessible DNA,2016-06-15,/labs/michael-snyder/,,,0.220021,148398.0,50,year,adult
1,ENCFF009YES,left colon,"colon,intestine,large intestine",digestive system,endoderm,Homo sapiens,ATAC-seq,Accessible DNA,2020-08-17,/labs/michael-snyder/,,,0.55763,260427.0,45,year,adult
2,ENCFF010JOP,adrenal gland,"adrenal gland,endocrine gland",endocrine system,"ectoderm,mesoderm",Homo sapiens,ATAC-seq,Accessible DNA,2016-10-31,/labs/michael-snyder/,,,0.282873,213082.0,50,year,adult
3,ENCFF012BKZ,heart right ventricle,heart,circulatory system,mesoderm,Homo sapiens,ATAC-seq,Accessible DNA,2020-08-05,/labs/michael-snyder/,,,0.320655,237529.0,60,year,adult
4,ENCFF018EMP,sigmoid colon,"colon,intestine,large intestine",digestive system,endoderm,Homo sapiens,ATAC-seq,Accessible DNA,2016-06-15,/labs/michael-snyder/,,,0.128024,123460.0,35,year,adult


In [14]:
# 220407_hg38_roi_mincov2_nc10_QC.tsv contains quality metrics per file
# accession (peak counts, score summary statistics, enrichment values; see
# the columns in the output below).
# These could be useful for evaluation.
md_qc.head()

Unnamed: 0,File_accession,N_peaks,bp_covered,non_zero,score_max,score_mean,score_median,score_min,score_q25,score_q75,...,TSS_lncRNA_enrich,TSS_protein_coding_enrich,TSS_rRNA_enrich,promoter_lncRNA_enrich,promoter_protein_coding_enrich,promoter_rRNA_enrich,CDS_enrich,five_prime_UTR_enrich,three_prime_UTR_enrich,intron_enrich
0,ENCFF003IMJ,28868.0,12878538.0,0.006282,18.1187,5.255017,5.0158,2.04119,4.2663,5.92776,...,1.075011,0.511839,0.0,0.933792,0.624095,0.0,2.047153,0.992063,2.064651,0.711246
1,ENCFF003KOQ,270663.0,59051563.0,0.028561,19.6766,0.743396,0.394735,0.007519,0.236841,0.78571,...,6.383012,8.532097,2.121995,2.981906,3.052459,0.567541,1.462436,7.948689,1.058709,1.100186
2,ENCFF003KVZ,3151.0,794841.0,0.000371,14.47028,3.343118,3.33129,1.86575,3.00153,3.681715,...,2.763362,3.730073,0.0,2.363636,2.952908,0.0,1.277884,4.135471,1.589795,1.242434
3,ENCFF003TGC,74706.0,96037308.0,0.046598,78.86799,7.510577,4.89327,1.74216,3.92062,7.60936,...,5.411217,8.217849,1.300609,3.748547,4.321479,0.72247,2.202078,9.579459,1.321944,1.191383
4,ENCFF004ANL,69167.0,111641447.0,0.054231,95.88441,11.268159,6.08826,1.92171,4.45677,11.942465,...,4.715368,7.12111,0.558776,3.403606,3.997887,0.34488,2.081526,8.259164,1.479355,1.346014


## Matching the metadata to the loaders

In [54]:
# Build the coverage datamodule on the toy dataset. The settings are collected
# first so they can be read (and tweaked) in one place, then unpacked into the
# constructor.
datamodule_kwargs = dict(
    seq_order=1,
    seq_len=2176,
    basepath="data/processed/GRCh38/toydata",
    ref_path=config['reference']['GRCh38'],
    batch_size=128,
    random_shift=3,
    random_reverse_complement=True,
)
datamodule = LitCoverageDatasetHDF5(**datamodule_kwargs)

BED-file contains 10000 regions.
93.250% of regions have at least 1 label.


In [55]:
# This attribute contains the label IDs. They look like ENCODE file
# accessions (see the output) and are used further down to align `md`
# with the loader.
datamodule.data.labelloader.label_ids

array(['ENCFF003IMJ', 'ENCFF003KOQ', 'ENCFF003KVZ', ..., 'ENCFF998LIO',
       'ENCFF998YYO', 'ENCFF999UGV'], dtype=object)

In [56]:
# This attribute contains the number of times a label appears in the whole
# dataset (training + validation + test).
# Presumably aligned element-wise with `label_ids` -- TODO confirm.
datamodule.data.labelloader.label_N

array([ 59, 276,   2, ...,  46,  63,   4])

In [58]:
# Some labels are so rare that they are not actually part of the toy dataset:
# print how many have a count of zero, then display the sorted counts divided
# by the length of the label loader.
label_counts = datamodule.data.labelloader.label_N
print((label_counts == 0).sum())

np.sort(label_counts / len(datamodule.data.labelloader))

11


array([0.    , 0.    , 0.    , ..., 0.1198, 0.1263, 0.1337])

In [59]:
# Index the metadata by accession so rows can be selected by label ID;
# drop=False keeps `File_accession` available as a regular column as well.
md.set_index('File_accession',drop=False,inplace=True)

In [60]:
# Select the metadata rows by the loader's label IDs, in the loader's order,
# so that the row order of `md` follows `label_ids`.
md = md.loc[datamodule.data.labelloader.label_ids]

In [61]:
# (rows, columns) of the metadata table after selecting by label ID.
md.shape

(2106, 17)

## Description of the metadata columns

## Make experiment groupings for metrics logs

In [11]:
# Distinct values of the processed target column.
md['proc_target'].unique()

array(['Accessible DNA', 'CTCF', 'H3K27ac', 'H3K27me3', 'H3K36me3',
       'H3K4me1', 'H3K4me3', 'H3K9ac', 'H3K9me3'], dtype=object)

In [15]:
# Distinct values of the processed assay column.
md['proc_Assay_lvl1'].unique()

array(['ATAC-seq', 'ChIP-seq', 'DNase-seq'], dtype=object)

In [22]:
# Assign every experiment (row of `md`, identified by its position) to a group:
#   * ATAC-seq and DNase-seq rows are grouped by assay alone;
#   * every other assay is grouped by "<assay> <target>".
# The columns are walked positionally instead of via `md[col][i]`: `md` is
# indexed by File_accession strings, so integer `[i]` lookups on its columns
# would rely on pandas' deprecated positional fallback.
assay_only_groups = ('ATAC-seq', 'DNase-seq')
assays = md['proc_Assay_lvl1'].to_numpy()
targets = md['proc_target'].to_numpy()

groupping = [
    (i, assay if assay in assay_only_groups else assay + ' ' + target)
    for i, (assay, target) in enumerate(zip(assays, targets))
]
        

In [25]:
import csv

In [32]:
# Write the (position, group) pairs to a CSV file with a header row.
# NOTE(review): the file name contains a typo ('gropus'); it is kept unchanged
# here because other code may depend on this exact path.
# newline='' is what the csv module expects for files it writes to; without
# it, platforms that translate '\n' on write emit an extra '\r' per row.
with open('../../src/data/metadata_gropus_assay.csv', 'w', newline='') as out:
    csv_out = csv.writer(out)
    csv_out.writerow(['n_experiment', 'group'])
    csv_out.writerows(groupping)