In [65]:
# Root directory of the pipeline data; the `!` shell cells in this notebook refer to it as $DIR_PATH.
DIR_PATH = '/mnt/stripe/bio/experiments/blueprint/data'

# Blueprint DMR processing

## Input: 6 methylcall files

In [62]:
!ls -alh $DIR_PATH/mincov0

total 6.4G
drwxrwxr-x  2 user user 4.0K Jul 20 17:48 .
drwxr-xr-x 12 user user 4.0K Aug  1 14:19 ..
-rw-rw-r--  1 user user 1.1G Jul  9 20:39 methylcall.CpG.C000S5A1bs.GRCh38.20160531.mincov0.txt
-rw-rw-r--  1 user user 1.1G Jul  9 20:43 methylcall.CpG.C0010KA2bs.GRCh38.20160531.mincov0.txt
-rw-rw-r--  1 user user 1.1G Jul  9 20:37 methylcall.CpG.C001UYA3bs.GRCh38.20160531.mincov0.txt
-rw-rw-r--  1 user user 1.1G Jul  9 20:35 methylcall.CpG.C004SQ51.GRCh38.20160531.mincov0.txt
-rw-rw-r--  1 user user 1.1G Jul  9 20:45 methylcall.CpG.C005PS51.GRCh38.20160531.mincov0.txt
-rw-rw-r--  1 user user 1.1G Jul  9 20:41 methylcall.CpG.S000RD54.GRCh38.20160531.mincov0.txt


## Gather cytosine coverage

In [None]:
%%R
#gather_cytosine_data.R
# Collects the per-cytosine coverage of every file in the input directory into
# one wide table: one row per chrBase, one `coverage.<tag>` column per file.
# <tag> is the third dot-separated part of the file name.
library(readr)
library(dplyr)
library(stringr)
library(argparse)

# parse command line arguments
parser <- ArgumentParser()
parser$add_argument('-d', '--inputDirectory', help = 'input directory', required = TRUE)
parser$add_argument('-p', '--prefix', help = 'file name prefix', required = TRUE)
parser$add_argument('-o', '--output', help = 'output file name', required = TRUE)
args <- parser$parse_args()

# get file info
input_directory <- args$inputDirectory
output_file <- args$output
# NOTE: the prefix is required by the parser but is not used to select files;
# every file in the input directory is read.
prefix <- args$prefix

# Reads one methylcall file and returns its chrBase/coverage columns, with the
# coverage column renamed to `coverage.<tag>`.
read_coverage <- function(file) {
  print(file)
  tag <- str_split(file, '\\.', simplify = TRUE)[1, 3]
  coverage_df <- read_tsv(file.path(input_directory, file)) %>%
    select(chrBase, coverage)
  colnames(coverage_df) <- c('chrBase', paste0('coverage.', tag))
  coverage_df
}

meth_files <- list.files(input_directory)
# fail early with a clear message: with no files there is no table to write
if (length(meth_files) == 0) {
  stop('no input files found in ', input_directory)
}

# full join keeps every cytosine seen in at least one file; a cytosine that is
# missing from a file gets NA in that file's coverage column
df <- read_coverage(meth_files[1])
for (file in meth_files[-1]) {
  df <- full_join(df, read_coverage(file), by = 'chrBase')
}

write_tsv(df, output_file)

In [19]:
!Rscript --vanilla $DIR_PATH/gather_cytosine_data.R -d $DIR_PATH/mincov0 -p any -o $DIR_PATH/coverage_df_2.tsv

package ‘readr’ was built under R version 3.4.3 

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

package ‘dplyr’ was built under R version 3.4.3 
package ‘stringr’ was built under R version 3.4.3 
[1] "methylcall.CpG.C000S5A1bs.GRCh38.20160531.mincov0.txt"
Parsed with column specification:
cols(
  chrBase = col_character(),
  chr = col_character(),
  base = col_integer(),
  strand = col_logical(),
  coverage = col_integer(),
  freqC = col_double(),
  freqT = col_double()
)
[1] "methylcall.CpG.C0010KA2bs.GRCh38.20160531.mincov0.txt"
Parsed with column specification:
cols(
  chrBase = col_character(),
  chr = col_character(),
  base = col_integer(),
  strand = col_logical(),
  coverage = col_integer(),
  freqC = col_double(),
  freqT = col_double()
)
[1] "methylcall.CpG.C001UYA3bs.GRCh38.20160531.mincov0.txt"
Parsed with column specificatio

## Filter cytosine coverage

In [None]:
%%R
# Keeps only the rows of a coverage table whose mean coverage across all
# coverage columns is at least 10, and writes them to a new TSV file.
library(readr)
library(dplyr)
library(argparse)

# command line interface: input table and output path
cli <- ArgumentParser()
cli$add_argument('-i', '--input', help = 'input', required = TRUE)
cli$add_argument('-o', '--output', help = 'output file name', required = TRUE)
opts <- cli$parse_args()

all_sites <- read_tsv(opts$input)

# every column except chrBase, as a matrix; missing values count as coverage 0
per_sample <- all_sites %>%
  select(-chrBase) %>%
  as.matrix()
per_sample[is.na(per_sample)] <- 0

# indices of the rows with mean(coverage) >= 10
keep_rows <- which(rowMeans(per_sample) >= 10)
write_tsv(all_sites[keep_rows, ], opts$output)

In [22]:
!Rscript --vanilla $DIR_PATH/filter_mean_10.R -i $DIR_PATH/coverage_df_2.tsv -o $DIR_PATH/coverage_df_2_filtered_mean_10.tsv

package ‘readr’ was built under R version 3.4.3 

Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

package ‘dplyr’ was built under R version 3.4.3 
Parsed with column specification:
cols(
  chrBase = col_character(),
  coverage.C000S5A1bs = col_integer(),
  coverage.C0010KA2bs = col_integer(),
  coverage.C001UYA3bs = col_integer(),
  coverage.C004SQ51 = col_integer(),
  coverage.C005PS51 = col_integer(),
  coverage.S000RD54 = col_integer()
)


## Convert to cytosine regions

In [69]:
# Turn the filtered coverage table into a three-column, tab-separated file:
# each row is split on '.' and tab, the header row (NR == 1) is skipped, and
# field 1 is printed followed by field 2 twice (presumably chromosome, position, position).
# NOTE(review): the paths are written out instead of using $DIR_PATH - presumably because
# the awk braces interfere with IPython's variable expansion on this line; confirm before changing.
!awk -F'[.\t]' '{if(NR!=1){print $1"\t"$2"\t"$2}}' /mnt/stripe/bio/experiments/blueprint/data/coverage_df_2_filtered_mean_10.tsv > /mnt/stripe/bio/experiments/blueprint/data/mean_10_covered_3.bed

In [71]:
!ls -alh $DIR_PATH/mean_10_covered_3.bed

-rw-rw-r-- 1 user user 606M Aug  3 17:54 /mnt/stripe/bio/experiments/blueprint/data/mean_10_covered_3.bed


## Run pipeline

In [None]:
%%bash
#!/bin/bash
# Re-sorts every file found directly inside mincov0/ by column 2 and then
# numerically by column 3, keeping the first (header) line on top, and writes
# the result to sorted_data/<name>.sorted.txt.

WDIR='/mnt/stripe/bio/experiments/blueprint/data/'
# stop if the data directory is missing; otherwise the loop would run in the wrong place
cd "$WDIR" || exit 1

# the redirections below do not create the output directory themselves
mkdir -p sorted_data

# NUL-delimited find output keeps file names with spaces or glob characters intact
while IFS= read -r -d '' full_file
do
    name=$(basename "$full_file")
    # drop the last four characters of the file name (".txt" for the current inputs)
    name=${name%????}
    out_name=sorted_data/${name}.sorted.txt
    echo "$out_name"
    # copy the header line unchanged
    head -1 "$full_file" > "$out_name"
    # body: turn each run of spaces into one tab, then sort by column 2 and numerically by column 3
    tail -n+2 "$full_file" | sed -e 's/ \+/\t/g' | sort -k2,2 -k3,3n >> "$out_name"
done < <(find mincov0 -maxdepth 1 -mindepth 1 -type f -print0)

In [28]:
!bash $DIR_PATH/sort_data.sh

sorted_data/methylcall.CpG.C000S5A1bs.GRCh38.20160531.mincov0.sorted.txt
sorted_data/methylcall.CpG.C0010KA2bs.GRCh38.20160531.mincov0.sorted.txt
sorted_data/methylcall.CpG.C001UYA3bs.GRCh38.20160531.mincov0.sorted.txt
sorted_data/methylcall.CpG.C004SQ51.GRCh38.20160531.mincov0.sorted.txt
sorted_data/methylcall.CpG.C005PS51.GRCh38.20160531.mincov0.sorted.txt
sorted_data/methylcall.CpG.S000RD54.GRCh38.20160531.mincov0.sorted.txt


In [50]:
!ls -alh $DIR_PATH/sorted_data

total 6.4G
drwxrwxr-x  2 user user 4.0K Jul 24 17:11 .
drwxr-xr-x 10 user user 4.0K Aug  1 13:50 ..
-rw-rw-r--  1 user user 1.1G Jul 31 15:02 methylcall.CpG.C000S5A1bs.GRCh38.20160531.mincov0.sorted.txt
-rw-rw-r--  1 user user 1.1G Jul 31 15:15 methylcall.CpG.C0010KA2bs.GRCh38.20160531.mincov0.sorted.txt
-rw-rw-r--  1 user user 1.1G Jul 31 15:29 methylcall.CpG.C001UYA3bs.GRCh38.20160531.mincov0.sorted.txt
-rw-rw-r--  1 user user 1.1G Jul 31 15:43 methylcall.CpG.C004SQ51.GRCh38.20160531.mincov0.sorted.txt
-rw-rw-r--  1 user user 1.1G Jul 31 15:56 methylcall.CpG.C005PS51.GRCh38.20160531.mincov0.sorted.txt
-rw-rw-r--  1 user user 1.1G Jul 31 16:10 methylcall.CpG.S000RD54.GRCh38.20160531.mincov0.sorted.txt


In [None]:
# prepare_methfiles.py
import argparse
import os

# run example:
# bash run_python3.sh scripts/python/prepare_methfiles.py '-i /scratch/shchukinai/aging/methylation/sorted_data/
# -c /scratch/shchukinai/aging/methylation/cytosines.bed -p methylcall -s minavcov10.sorted
# -o /scratch/shchukinai/aging/methylation/clean_methpipe/data/' methfiles

def convert_line_to_methpipe_format(line):
    """Convert one tab-separated methylcall record into an output record.

    The input line must hold exactly seven tab-separated fields: id,
    chromosome, position, strand, coverage, freqC and freqT (a different
    field count raises ValueError from the unpacking).

    Returns a newline-terminated, tab-joined string with: chromosome,
    position, strand sign, the literal 'CpG', freqC divided by 100, coverage.
    A strand of 'F' becomes '+'; any other value becomes '-'.
    """
    _site_id, chrom, position, strand_code, depth, freq_c, _freq_t = line.strip().split('\t')
    sign = '+' if strand_code == 'F' else '-'
    # freqC is rescaled from a 0-100 value to a 0-1 fraction
    fraction = str(float(freq_c) / 100.0)
    fields = (chrom, position, sign, 'CpG', fraction, depth)
    return '\t'.join(fields) + '\n'


def prepare_methfiles(cytosine_set, folder, prefix, suffix, output_dir):
    """Filter methylcall files down to a cytosine set and convert the kept rows.

    Every regular file in ``folder`` whose name starts with ``prefix`` is
    processed. Its first line is treated as a header and dropped; each
    remaining line is kept only if its first whitespace-separated field is in
    ``cytosine_set``, and kept lines are written through
    ``convert_line_to_methpipe_format``. Blank lines are skipped.

    The output file name is the first three dot-separated parts of the input
    file name followed by ``suffix`` and ``meth``, joined with '.', inside
    ``output_dir``.

    Args:
        cytosine_set: set of identifiers compared against the first field of each line.
        folder: directory holding the input files.
        prefix: only files whose names start with this string are processed.
        suffix: name part inserted before the final ``meth`` extension.
        output_dir: existing directory that receives the output files.
    """
    # go through the input directory and pick the regular files to filter
    candidates = (
        os.path.join(folder, file_name)
        for file_name in os.listdir(folder)
        if file_name.startswith(prefix)
    )
    files = [path for path in candidates if os.path.isfile(path)]
    for file_name in files:
        name_parts = os.path.basename(file_name).split('.')[:3]
        name_parts.extend([suffix, 'meth'])
        output_name = os.path.join(output_dir, '.'.join(name_parts))
        with open(file_name) as inp, open(output_name, 'w') as out:
            # skip header
            next(inp, None)
            for line in inp:
                fields = line.split()
                # a blank line (e.g. a trailing empty line) has no identifier to look up
                if not fields:
                    continue
                # skip cytosines that are not in the filtered set
                if fields[0] not in cytosine_set:
                    continue
                out.write(convert_line_to_methpipe_format(line))


def main():
    """Command line entry point: load the cytosine set and filter the methylation files.

    The cytosine file is read line by line; the first two whitespace-separated
    fields of each line are joined with '.' to form the identifier stored in
    the set. Blank lines are ignored and any further columns are not used.

    Raises:
        ValueError: if a non-blank line of the cytosine file has fewer than two fields.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--inputDirectory', required=True, help='directory with initial sorted files')
    parser.add_argument('-p', '--prefix', help="name prefix for methylation data files", required=True)
    parser.add_argument('-c', '--cytosine', help="BED file with cytosines set", required=True)
    parser.add_argument('-s', '--suffix', help="suffix to add", required=True)
    # the output directory is always needed to build output paths, so ask for it up front
    parser.add_argument('-o', '--outputDirectory', help='output directory', required=True)
    args = parser.parse_args()

    directory = args.inputDirectory
    prefix = args.prefix
    set_file = args.cytosine
    output = args.outputDirectory
    suffix = args.suffix

    # read in cytosines to keep
    cytosine_set = set()
    with open(set_file) as inp:
        for line_number, line in enumerate(inp, start=1):
            fields = line.split()
            if not fields:
                continue
            if len(fields) < 2:
                raise ValueError(
                    '{}:{}: expected at least 2 columns, got {}'.format(set_file, line_number, len(fields))
                )
            chrom, start = fields[:2]
            cytosine_set.add('.'.join([chrom, start]))

    prepare_methfiles(cytosine_set, directory, prefix, suffix, output)


if __name__ == "__main__":
    main()

In [72]:
!rm -rf $DIR_PATH/prepare_output_3
!mkdir $DIR_PATH/prepare_output_3
!python $DIR_PATH/prepare_methfiles.py -i $DIR_PATH/sorted_data -p methylcall -s minavcov0.sorted -c $DIR_PATH/mean_10_covered_3.bed -o $DIR_PATH/prepare_output_3


In [73]:
# Insert a group label (YD1, YD2, OD1-OD4) as the third dot-separated part of each .meth file name.
# prepare_methpipe_input.py later reads the third dot-separated part of each proportion-table
# header entry (presumably derived from these file names): entries starting with 'OD' get case = 1.
!mv $DIR_PATH/prepare_output_3/methylcall.CpG.C005PS51.minavcov0.sorted.meth $DIR_PATH/prepare_output_3/methylcall.CpG.YD1.C005PS51.minavcov0.sorted.meth
!mv $DIR_PATH/prepare_output_3/methylcall.CpG.S000RD54.minavcov0.sorted.meth $DIR_PATH/prepare_output_3/methylcall.CpG.YD2.S000RD54.minavcov0.sorted.meth

!mv $DIR_PATH/prepare_output_3/methylcall.CpG.C000S5A1bs.minavcov0.sorted.meth $DIR_PATH/prepare_output_3/methylcall.CpG.OD1.C000S5A1bs.minavcov0.sorted.meth
!mv $DIR_PATH/prepare_output_3/methylcall.CpG.C0010KA2bs.minavcov0.sorted.meth $DIR_PATH/prepare_output_3/methylcall.CpG.OD2.C0010KA2bs.minavcov0.sorted.meth
!mv $DIR_PATH/prepare_output_3/methylcall.CpG.C001UYA3bs.minavcov0.sorted.meth $DIR_PATH/prepare_output_3/methylcall.CpG.OD3.C001UYA3bs.minavcov0.sorted.meth
!mv $DIR_PATH/prepare_output_3/methylcall.CpG.C004SQ51.minavcov0.sorted.meth $DIR_PATH/prepare_output_3/methylcall.CpG.OD4.C004SQ51.minavcov0.sorted.meth


In [77]:
!ls -alh $DIR_PATH/prepare_output_3

total 4.7G
drwxrwxr-x  2 user user 4.0K Aug  6 14:43 .
drwxr-xr-x 14 user user 4.0K Aug  6 14:41 ..
-rw-rw-r--  1 user user 800M Aug  6 14:23 methylcall.CpG.OD1.C000S5A1bs.minavcov0.sorted.meth
-rw-rw-r--  1 user user 797M Aug  6 14:25 methylcall.CpG.OD2.C0010KA2bs.minavcov0.sorted.meth
-rw-rw-r--  1 user user 805M Aug  6 14:26 methylcall.CpG.OD3.C001UYA3bs.minavcov0.sorted.meth
-rw-rw-r--  1 user user 803M Aug  6 14:28 methylcall.CpG.OD4.C004SQ51.minavcov0.sorted.meth
-rw-rw-r--  1 user user 800M Aug  6 14:30 methylcall.CpG.YD1.C005PS51.minavcov0.sorted.meth
-rw-rw-r--  1 user user 805M Aug  6 14:32 methylcall.CpG.YD2.S000RD54.minavcov0.sorted.meth


In [None]:
#prepare_methpipe_input.py
import argparse
import os
import subprocess

def main():
    """Build the proportion table with merge-methcounts and write the matching design matrix.

    Steps:
      1. Run ``merge-methcounts -t`` on every file in the input directory and
         store its standard output as ``<name>_proportion_table.txt``.
      2. Read the first line of that table and write
         ``<name>_design_matrix_complex.txt``: a header row with the three
         column names base/case/batch, then one row per header entry holding
         the entry itself followed by its base, case and batch values.

    For each header entry the third dot-separated part is used as the sample
    id: ``case`` is '1' when the id starts with 'OD', ``batch`` is '1' when the
    number after the first two characters of the id is greater than 10, and
    ``base`` is always '1'.

    Raises:
        FileNotFoundError: if the input directory contains no files.
        subprocess.CalledProcessError: if merge-methcounts exits with a non-zero status.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--inputDataDirectory', help='directory with .meth files', required=True)
    parser.add_argument('-n', '--name', help="name to use for table and design", required=True)
    # the output directory is always needed to build output paths, so ask for it up front
    parser.add_argument('-o', '--outputDirectory', help='output directory', required=True)
    args = parser.parse_args()

    data_directory = args.inputDataDirectory
    name_prefix = args.name
    output_directory = args.outputDirectory

    # use Methpipe to prepare table
    table_name = os.path.join(output_directory, name_prefix + '_proportion_table.txt')
    # sorted so the order of the inputs does not depend on the directory listing order
    methfiles = sorted(os.path.join(data_directory, file_name) for file_name in os.listdir(data_directory))
    if not methfiles:
        raise FileNotFoundError('no input files found in {}'.format(data_directory))
    #subprocess.run('module load gcc-4.7.2', shell=True)
    # argument list instead of a shell string: paths with spaces or shell
    # metacharacters are passed through unchanged; check=True stops the script
    # when the tool fails instead of reading an incomplete table below
    with open(table_name, 'w') as table:
        subprocess.run(['merge-methcounts', '-t'] + methfiles, stdout=table, check=True)

    # construct design table using proportion table header
    with open(table_name) as inp:
        prop_header = inp.readline().strip().split()
    design_name = os.path.join(output_directory, name_prefix + '_design_matrix_complex.txt')
    with open(design_name, 'w') as out:
        out.write('\t'.join(['base', 'case', 'batch']) + '\n')
        base = '1'
        for methfile in prop_header:
            sample_id = methfile.split('.')[2]
            case = '1' if sample_id.startswith('OD') else '0'
            batch = '1' if int(sample_id[2:]) > 10 else '0'
            out.write('\t'.join([methfile, base, case, batch]) + '\n')


if __name__ == "__main__":
    main()

In [78]:
!rm -rf $DIR_PATH/methpipe_input_3
!mkdir $DIR_PATH/methpipe_input_3
!python $DIR_PATH/prepare_methpipe_input.py -i $DIR_PATH/prepare_output_3/ -o $DIR_PATH/methpipe_input_3/ -n methpipe_input_3
!ls -alh $DIR_PATH/methpipe_input_3

total 1.5G
drwxrwxr-x  2 user user  111 Aug  6 14:49 .
drwxr-xr-x 14 user user 4.0K Aug  6 14:43 ..
-rw-rw-r--  1 user user  328 Aug  6 14:49 methpipe_input_3_design_matrix_complex.txt
-rw-rw-r--  1 user user 1.5G Aug  6 14:49 methpipe_input_3_proportion_table.txt


In [None]:
#run_methpipe.sh
#!/bin/bash
# Runs three radmeth steps in sequence: regression on the `case` factor,
# adjust, and merge.
#
# Usage: run_methpipe.sh DESIGN TABLE OUTPUT [NAME]
#   DESIGN  design matrix file
#   TABLE   proportion table file
#   OUTPUT  file written by `radmeth regression`
#   NAME    optional; only referenced by the commented-out log cleanup below

# abort at the first failing step so later steps never run on missing or partial output
set -e

if [ "$#" -lt 3 ]; then
    echo "usage: $0 DESIGN TABLE OUTPUT [NAME]" >&2
    exit 1
fi

DESIGN=$1
TABLE=$2
OUTPUT=$3
NAME=$4

#rm ${NAME}.log

# derived names: OUTPUT without its last extension plus ".narrow.adjusted.bed",
# and that name without ".bed" plus ".regions.bed"
ADJ_OUTPUT="${OUTPUT%.*}".narrow.adjusted.bed
REG_OUTPUT="${ADJ_OUTPUT%.*}".regions.bed

radmeth regression -factor case -o "$OUTPUT" "$DESIGN" "$TABLE"
radmeth adjust -bins 1:50:1 "$OUTPUT" > "$ADJ_OUTPUT"
radmeth merge -p 0.05 "$ADJ_OUTPUT" > "$REG_OUTPUT"

In [None]:
# -p keeps this cell re-runnable: plain mkdir reports an error when the directory already exists
!mkdir -p $DIR_PATH/methpipe_output_3
!bash $DIR_PATH/run_methpipe.sh $DIR_PATH/methpipe_input_3/methpipe_input_3_design_matrix_complex.txt $DIR_PATH/methpipe_input_3/methpipe_input_3_proportion_table.txt $DIR_PATH/methpipe_output_3/clean_methpipe.bed
!ls -alh $DIR_PATH/methpipe_output_3
