https://www.cell.com/molecular-cell/pdf/S1097-2765(22)00900-5.pdf

# Alignment task - Running `STAR`

In [9]:
cat star.sh

# Align every fastq/*R1* file against one STAR index and keep one sorted BAM
# per sample.
#
# Usage: bash star.sh <index> <bamDIR> <jobs>
#   $1  STAR genome index directory (--genomeDir)
#   $2  directory receiving <sample>.bam; Log.final.out files are moved to
#       <bamDIR>_star_qc
#   $3  thread count passed to --runThreadN
if [ "$#" -lt 3 ]; then
    # Without this check an empty bamDIR would make the cleanup globs below
    # expand relative to "/".
    echo "usage: $0 <index> <bamDIR> <jobs>" >&2
    exit 1
fi

index=$1
bamDIR=$2
JOBS=$3

mkdir -p "${bamDIR}"
mkdir -p "${bamDIR}_star_qc"

# NOTE(review): the STAR calls in the loop do not pass --genomeLoad, so they
# may not use the genome pre-loaded here (the STAR manual lists NoSharedMemory
# as the default) -- confirm before relying on the shared-memory copy.
STAR --genomeLoad LoadAndExit --genomeDir "$index"

for fq in fastq/*R1*; do
    # When nothing matches, the loop would otherwise run once on the literal pattern.
    [ -e "$fq" ] || continue
    fq=$(basename "$fq")
    # Sample name = file name with everything from "_R1" onwards removed.
    out=${fq/_R1*/}
    echo '------------' "$out" '-----------'
    STAR \
    --outSAMtype BAM SortedByCoordinate \
    --readFilesCommand zcat \
    --runThreadN "$JOBS" \
    --genomeDir "$index" \
    --readFilesIn "fastq/$fq" \
    --outFileNamePrefix "${bamDIR}/$out"

    # Keep the sorted BAM and the final log; remove the other STAR outputs.
    mv -v "${bamDIR}/${out}Aligned.sortedByCoord.out.bam" "${bamDIR}/${out}.bam"
    mv -v "${bamDIR}/${out}Log.final.out" "${bamDIR}_star_qc/"
    rm -v "${bamDIR}/${out}"*out*
    rm -rv "${bamDIR}/${out}_STARtmp/"

done

STAR --genomeLoad Remove --genomeDir "$index"

# Clean up files left in the current directory; -f because not every one of
# them necessarily exists.
rm -rf _STARtmp/ Log.out Log.progress.out Aligned.out.sam


In [3]:
%%bash
# Print the repeat-class name derived from each *_star_index directory.
# The directories are globbed directly instead of parsing `ls` output.
for STARindex in ~/tools/HERVs/files/*_star_index/; do
    # Skip the literal pattern when no index directory exists.
    [ -d "$STARindex" ] || continue
    # e.g. package-entities-erv_star_index -> erv
    name=$(basename "$STARindex")
    name=${name/package-entities-/}
    name=${name/_star_index/}
    echo "$name"
    # nohup bash star.sh $STARindex align/bam_${name} 30 &> align/bam_${name}.out;
    wait
done

erv
line
rc
retroposon
satellite
scrna
sine
snrna
trna


___

To keep only the mapped reads (`samtools view -F 4`), see https://www.biostars.org/p/56246/

In [None]:
%%bash
# For every repeat class, write <sample>.mapped.bam containing only the
# mapped reads (samtools -F 4) of each <sample>.bam.
for STARindex in ~/tools/HERVs/files/*_star_index/; do
    [ -d "$STARindex" ] || continue
    name=$(basename "$STARindex")
    name=${name/package-entities-/}
    name=${name/_star_index/}
    echo "$name"
    for bam in "align/bam_${name}"/*.bam; do
        # Skip the literal pattern when the directory holds no BAM files.
        [ -e "$bam" ] || continue
        # Skip outputs of an earlier run; otherwise a re-run produces
        # *.mapped.mapped.bam files.
        case "$bam" in
            *.mapped.bam) continue ;;
        esac
        echo "$bam"
        # Replace only the trailing ".bam" extension.
        bam_mapped="${bam%.bam}.mapped.bam"
        samtools view -b -F 4 "$bam" > "$bam_mapped"
    done
done

In [17]:
%%bash
# Replace each <sample>.bam with its mapped-only counterpart by renaming
# <sample>.mapped.bam back to <sample>.bam.
for STARindex in ~/tools/HERVs/files/*_star_index/; do
    [ -d "$STARindex" ] || continue
    name=$(basename "$STARindex")
    name=${name/package-entities-/}
    name=${name/_star_index/}
    echo "$name"
    for bam_mapped in "align/bam_${name}"/*.mapped.bam; do
        # Skip the literal pattern when nothing matches, so mv is never
        # called on a non-existent file.
        [ -e "$bam_mapped" ] || continue
        # Strip only the trailing ".mapped.bam" suffix.
        bam="${bam_mapped%.mapped.bam}.bam"
        mv -v "$bam_mapped" "$bam"
    done
done

erv
‘align/bam_erv/T1.input.mapped.bam’ -> ‘align/bam_erv/T1.input.bam’
‘align/bam_erv/T1.m6A.mapped.bam’ -> ‘align/bam_erv/T1.m6A.bam’
‘align/bam_erv/T2.input.mapped.bam’ -> ‘align/bam_erv/T2.input.bam’
‘align/bam_erv/T2.m6A.mapped.bam’ -> ‘align/bam_erv/T2.m6A.bam’
‘align/bam_erv/U1.input.mapped.bam’ -> ‘align/bam_erv/U1.input.bam’
‘align/bam_erv/U1.input.mapped.mapped.bam’ -> ‘align/bam_erv/U1.input.mapped.bam’
‘align/bam_erv/U1.m6A.mapped.bam’ -> ‘align/bam_erv/U1.m6A.bam’
‘align/bam_erv/U2.input.mapped.bam’ -> ‘align/bam_erv/U2.input.bam’
‘align/bam_erv/U2.m6A.mapped.bam’ -> ‘align/bam_erv/U2.m6A.bam’
line
‘align/bam_line/T1.input.mapped.bam’ -> ‘align/bam_line/T1.input.bam’
‘align/bam_line/T1.m6A.mapped.bam’ -> ‘align/bam_line/T1.m6A.bam’
‘align/bam_line/T2.input.mapped.bam’ -> ‘align/bam_line/T2.input.bam’
‘align/bam_line/T2.m6A.mapped.bam’ -> ‘align/bam_line/T2.m6A.bam’
‘align/bam_line/U1.input.mapped.bam’ -> ‘align/bam_line/U1.input.bam’
‘align/bam_line/U1.m6A.mapped.bam’ -> ‘

# Peak calling task - Running `exomePeak2`

https://bioconductor.org/packages/release/bioc/vignettes/exomePeak2/inst/doc/Vignette_V_2.00.html

Run the cells in this section from the `exomepeak2` environment.

In [1]:
suppressMessages(suppressWarnings(library(exomePeak2)))

In [2]:
suppressMessages(suppressWarnings(library (GenomicFeatures)))

In [2]:
suppressMessages(suppressWarnings(library (tidyverse)))

In [3]:
# suppressMessages(suppressWarnings(library (Guitar)))

In [4]:
# Print numbers with up to 5 significant digits.
options(digits = 5)

In [5]:
# Create `output_dir` (including missing parent directories) if it does not
# exist yet, and print what happened.
#
# Arguments:
#   output_dir  Path of the directory to create.
#
# Prints the path followed by "created!" or "already exists!". Stops with an
# error when the directory cannot be created, instead of printing "created!"
# for a directory that is still missing.
mkdir <- function(output_dir) {
    if (!dir.exists(output_dir)) {
        # recursive = TRUE so nested paths such as "exomepeak2/line" work even
        # when the parent folder does not exist yet.
        created <- dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)
        if (!created) {
            stop("could not create directory: ", output_dir, call. = FALSE)
        }
        print(output_dir)
        print("created!")
    } else {
        print(output_dir)
        print("already exists!")
    }
}

In [7]:
# Run exomePeak2 on the control (U1, U2) versus treated (T1, T2) samples of
# one BAM directory.
#
# Arguments:
#   GENE_ANNO_GTF  Path of the GTF annotation passed to makeTxDbFromGFF().
#   OUTPUT         Accepted for backward compatibility only; the function
#                  never reads it (the original body overwrote it with a
#                  constant and did not use it either).
#   bamDIR         Directory containing <sample>.m6A.bam (IP) and
#                  <sample>.input.bam (input) for U1, U2, T1 and T2.
#
# Returns the object returned by exomePeak2().
runexomepeak <- function(GENE_ANNO_GTF, OUTPUT, bamDIR) {

    ############################### read meta ###############################
    control_samples <- c("U1", "U2")
    treated_samples <- c("T1", "T2")

    ip_suffix <- ".m6A"
    input_suffix <- ".input"

    ip_bam <- paste0(control_samples, ip_suffix, ".bam")
    input_bam <- paste0(control_samples, input_suffix, ".bam")

    treated_ip_bam <- paste0(treated_samples, ip_suffix, ".bam")
    treated_input_bam <- paste0(treated_samples, input_suffix, ".bam")

    ############################### run exomepeak ###########################
    # Built before changing directory so a relative GTF path still resolves.
    txdb <- makeTxDbFromGFF(GENE_ANNO_GTF, organism = "Homo sapiens")

    # The BAM names above carry no directory, so exomePeak2 has to run from
    # inside bamDIR. on.exit() restores the caller's working directory even
    # when exomePeak2 fails or is interrupted, and works for any bamDIR depth.
    previous_wd <- setwd(bamDIR)
    on.exit(setwd(previous_wd), add = TRUE)

    res <- exomePeak2(
        bam_ip = ip_bam,
        bam_input = input_bam,
        bam_ip_treated = treated_ip_bam,
        bam_input_treated = treated_input_bam,
        txdb = txdb,
        parallel = 15
    )

    res
}

### erv

In [76]:
# Run runexomepeak() on the erv annotation with the BAMs in align/bam_erv.
res <- runexomepeak(
    GENE_ANNO_GTF = "~/tools/HERVs/files/package-entities-erv.gtf.gz",
    OUTPUT = "exomepeak2/erv",
    bamDIR = "align/bam_erv"
)

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
OK

“Reference genome not provided, GC content bias is left uncorrected.”
Extract bin features ... 


In [None]:
# No visible cell creates exomepeak2/erv, and saveRDS() fails when the target
# folder is missing, so make sure it exists before storing the result.
dir.create("exomepeak2/erv", recursive = TRUE, showWarnings = FALSE)
saveRDS(res, "exomepeak2/erv/results.rds")

### line

In [84]:
# Drop the previous result object before the next run.
rm(list = "res")

In [87]:
# Run runexomepeak() on the line annotation with the BAMs in align/bam_line.
res <- runexomepeak(
    GENE_ANNO_GTF = "~/tools/HERVs/files/package-entities-line.gtf.gz",
    OUTPUT = "exomepeak2/line",
    bamDIR = "align/bam_line"
)

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
OK

“Reference genome not provided, GC content bias is left uncorrected.”
Extract bin features ... 


In [None]:
# Make sure the output folder for the line results exists.
mkdir(output_dir = "exomepeak2/line")

In [None]:
# Store the line result object.
saveRDS(res, file = "exomepeak2/line/results.rds")

### rc

In [22]:
# Manual recovery step: moves the working directory up two levels.
# NOTE(review): only run this while the session is still inside an
# align/bam_* folder (check with getwd()); from the project root it moves
# above the project.
setwd('../../')

In [23]:
# Run runexomepeak() on the rc annotation with the BAMs in align/bam_rc.
res <- runexomepeak(
    GENE_ANNO_GTF = "~/tools/HERVs/files/package-entities-rc.gtf.gz",
    OUTPUT = "exomepeak2/rc",
    bamDIR = "align/bam_rc"
)

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
OK

“Reference genome not provided, GC content bias is left uncorrected.”
Extract bin features ... 
OK

Count reads on bin features ... 
OK

Identify background features ... 
OK

Estimate sample sepecific size factors from the background ... 
OK

Detect peaks with GLM ... 
OK

Count reads on peaks ... 
OK

Calculate offset matrix for peaks ... 
OK

Detect differentially modified peaks with interactive GLM ... 
OK

No significant peaks detected, result unsaved.



In [None]:
# saveRDS(res, 'exomepeak2/rc/results.rds')

### retroposon

In [None]:
# Manual recovery step: moves the working directory up two levels.
# NOTE(review): only run this while the session is still inside an
# align/bam_* folder (check with getwd()); from the project root it moves
# above the project.
setwd('../../')

In [None]:
# Run runexomepeak() on the retroposon annotation with the BAMs in
# align/bam_retroposon.
res <- runexomepeak(
    GENE_ANNO_GTF = "~/tools/HERVs/files/package-entities-retroposon.gtf.gz",
    OUTPUT = "exomepeak2/retroposon",
    bamDIR = "align/bam_retroposon"
)

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
OK

“Reference genome not provided, GC content bias is left uncorrected.”
Extract bin features ... 
OK

Count reads on bin features ... 
OK

Identify background features ... 
OK

Estimate sample sepecific size factors from the background ... 
OK

Detect peaks with GLM ... 
OK

Count reads on peaks ... 


In [28]:
# Display the object currently held in `res` (auto-printed by the notebook).
res

GRangesList object of length 206:
$`1`
GRanges object with 1 range and 0 metadata columns:
    seqnames            ranges strand
       <Rle>         <IRanges>  <Rle>
  1    chr14 20309837-20309886      +
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

$`2`
GRanges object with 1 range and 0 metadata columns:
    seqnames              ranges strand
       <Rle>           <IRanges>  <Rle>
  2     chr1 156058333-156058407      -
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

$`3`
GRanges object with 1 range and 0 metadata columns:
    seqnames              ranges strand
       <Rle>           <IRanges>  <Rle>
  3     chr1 156292215-156292239      -
  -------
  seqinfo: 24 sequences from an unspecified genome; no seqlengths

...
<203 more elements>

In [31]:
# Make sure the output folder for the retroposon results exists.
mkdir(output_dir = "exomepeak2/retroposon")

[1] "exomepeak2/retroposon"
[1] "created!"


In [32]:
# Store the retroposon result object.
saveRDS(res, file = "exomepeak2/retroposon/results.rds")

In [38]:
# Flatten `res` to a data frame and write the seqnames, start, end and strand
# columns as tab-separated text. TRUE/FALSE are spelled out because T/F can
# be reassigned.
# NOTE(review): the file gets a header row and the coordinates are written
# unchanged -- confirm downstream tools accept this layout as BED.
res %>%
    as.data.frame() %>%
    dplyr::select(c("seqnames", "start", "end", "strand")) %>%
    write.table(
        "exomepeak2/retroposon/results.bed",
        sep = "\t", quote = FALSE, row.names = FALSE
    )

### satellite

In [None]:
# Run runexomepeak() on the satellite annotation with the BAMs in
# align/bam_satellite.
res <- runexomepeak(
    GENE_ANNO_GTF = "~/tools/HERVs/files/package-entities-satellite.gtf.gz",
    OUTPUT = "exomepeak2/satellite",
    bamDIR = "align/bam_satellite"
)

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
OK

“Reference genome not provided, GC content bias is left uncorrected.”
Extract bin features ... 
OK

Count reads on bin features ... 
OK

Identify background features ... 
OK

Estimate sample sepecific size factors from the background ... 
OK

Detect peaks with GLM ... 
OK

Count reads on peaks ... 


In [43]:
# Make sure the output folder for the satellite results exists.
mkdir(output_dir = "exomepeak2/satellite")

[1] "exomepeak2/satellite"
[1] "created!"


In [44]:
# Store the satellite result object.
saveRDS(res, file = "exomepeak2/satellite/results.rds")

In [45]:
# Flatten `res` to a data frame and write the seqnames, start, end and strand
# columns as tab-separated text. TRUE/FALSE are spelled out because T/F can
# be reassigned.
# NOTE(review): the file gets a header row and the coordinates are written
# unchanged -- confirm downstream tools accept this layout as BED.
res %>%
    as.data.frame() %>%
    dplyr::select(c("seqnames", "start", "end", "strand")) %>%
    write.table(
        "exomepeak2/satellite/results.bed",
        sep = "\t", quote = FALSE, row.names = FALSE
    )

### scrna

In [None]:
# Run runexomepeak() on the scrna annotation with the BAMs in align/bam_scrna.
res <- runexomepeak(
    GENE_ANNO_GTF = "~/tools/HERVs/files/package-entities-scrna.gtf.gz",
    OUTPUT = "exomepeak2/scrna",
    bamDIR = "align/bam_scrna"
)

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
OK

“Reference genome not provided, GC content bias is left uncorrected.”
Extract bin features ... 
OK

Count reads on bin features ... 
OK

Identify background features ... 
OK

Estimate sample sepecific size factors from the background ... 
OK

Detect peaks with GLM ... 
OK

Count reads on peaks ... 


In [54]:
# Make sure the output folder for the scrna results exists.
mkdir(output_dir = "exomepeak2/scrna")

[1] "exomepeak2/scrna"
[1] "created!"


In [56]:
# Flatten `res` to a data frame and write the seqnames, start, end and strand
# columns as tab-separated text. TRUE/FALSE are spelled out because T/F can
# be reassigned.
# NOTE(review): the file gets a header row and the coordinates are written
# unchanged -- confirm downstream tools accept this layout as BED.
res %>%
    as.data.frame() %>%
    dplyr::select(c("seqnames", "start", "end", "strand")) %>%
    write.table(
        "exomepeak2/scrna/results.bed",
        sep = "\t", quote = FALSE, row.names = FALSE
    )

In [55]:
# Store the scrna result object.
saveRDS(res, file = "exomepeak2/scrna/results.rds")

### sine

In [None]:
# Run runexomepeak() on the sine annotation with the BAMs in align/bam_sine.
res <- runexomepeak(
    GENE_ANNO_GTF = "~/tools/HERVs/files/package-entities-sine.gtf.gz",
    OUTPUT = "exomepeak2/sine",
    bamDIR = "align/bam_sine"
)

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
OK

“Reference genome not provided, GC content bias is left uncorrected.”
Extract bin features ... 


In [66]:
# Show the current working directory; the cells below write to relative paths.
getwd()

In [67]:
# Make sure the output folder for the sine results exists.
mkdir(output_dir = "exomepeak2/sine")

[1] "exomepeak2/sine"
[1] "created!"


In [68]:
# Flatten `res` to a data frame and write the seqnames, start, end and strand
# columns as tab-separated text. TRUE/FALSE are spelled out because T/F can
# be reassigned.
# NOTE(review): the file gets a header row and the coordinates are written
# unchanged -- confirm downstream tools accept this layout as BED.
res %>%
    as.data.frame() %>%
    dplyr::select(c("seqnames", "start", "end", "strand")) %>%
    write.table(
        "exomepeak2/sine/results.bed",
        sep = "\t", quote = FALSE, row.names = FALSE
    )

In [69]:
# Store the sine result object.
saveRDS(res, file = "exomepeak2/sine/results.rds")

### snrna

In [6]:
# Run runexomepeak() on the snrna annotation with the BAMs in align/bam_snrna.
res <- runexomepeak(
    GENE_ANNO_GTF = "~/tools/HERVs/files/package-entities-snrna.gtf.gz",
    OUTPUT = "exomepeak2/snrna",
    bamDIR = "align/bam_snrna"
)

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
OK

“Reference genome not provided, GC content bias is left uncorrected.”
Extract bin features ... 
OK

Count reads on bin features ... 
OK

Identify background features ... 
OK

Estimate sample sepecific size factors from the background ... 
OK

Detect peaks with GLM ... 
OK

Count reads on peaks ... 
OK

Calculate offset matrix for peaks ... 
OK

Detect differentially modified peaks with interactive GLM ... 
OK



#### 7SK methylation by METTL3 promotes transcriptional activity
https://www.science.org/doi/10.1126/sciadv.ade7500

In [None]:
# No visible cell creates exomepeak2/snrna, and saveRDS() fails when the
# target folder is missing, so make sure it exists before storing the result.
dir.create("exomepeak2/snrna", recursive = TRUE, showWarnings = FALSE)
saveRDS(res, "exomepeak2/snrna/results.rds")

### trna

In [None]:
# Run runexomepeak() on the trna annotation with the BAMs in align/bam_trna.
res <- runexomepeak(
    GENE_ANNO_GTF = "~/tools/HERVs/files/package-entities-trna.gtf.gz",
    OUTPUT = "exomepeak2/trna",
    bamDIR = "align/bam_trna"
)

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
OK

“Reference genome not provided, GC content bias is left uncorrected.”
Extract bin features ... 
OK

Count reads on bin features ... 
OK

Identify background features ... 
OK

Estimate sample sepecific size factors from the background ... 
OK

Detect peaks with GLM ... 
OK

Count reads on peaks ... 


In [74]:
# Make sure the output folder for the trna results exists.
mkdir(output_dir = "exomepeak2/trna")

[1] "exomepeak2/trna"
[1] "created!"


In [75]:
# Store the trna result object.
saveRDS(res, file = "exomepeak2/trna/results.rds")

___
## Save results into files

In [None]:
# %%R 
suppressMessages(suppressWarnings(library (tidyverse)))

In [None]:
# %%R 
# Import the annotation file whose path is stored in `GTF`.
# NOTE(review): `GTF` is not assigned in any visible cell -- confirm it is
# defined before this cell runs.
gtf <- rtracklayer::import(GTF)

# gene2name <- gtf[gtf$type == "gene"] %>% data.frame %>% column_to_rownames('gene_id') %>% dplyr::select('gene_name')
# message ('-> GTF loaded!')

# add_Name <- function(res, gene2name){
#     res$ensembl <- res$name %>% as.character
#     res$name <- gene2name[res$ensembl %>% as.character,]
#     return (res)
# }

In [None]:
# Show the rows of result_sig whose `name` matches the pattern "DAC".
result_sig %>%
    filter(grepl("DAC", name))

In [None]:
# Show the rows of result_all whose `name` matches the pattern "DAC".
result_all %>%
    filter(grepl("DAC", name))

In [None]:
# %%R
# Write the rows whose `name` matches "DAC" from both result tables as
# tab-separated text. TRUE/FALSE are spelled out because T/F can be reassigned.
result_all %>%
    filter(grepl("DAC", name)) %>%
    write.table(
        file = "radar_erv/result.all.txt",
        row.names = FALSE, sep = "\t", quote = FALSE
    )
result_sig %>%
    filter(grepl("DAC", name)) %>%
    write.table(
        file = "radar_erv/result.sig.txt",
        row.names = FALSE, sep = "\t", quote = FALSE
    )

In [None]:
# cp -v radar/result.sig.txt hl60_delta_mtyl_table.txt 

In [None]:
# !cat hl60_delta_mtyl_table.txt | head 

### Save results into `bed12` format 
`tidyverse` cannot be loaded while using RADAR!

In [None]:
# Save the first 12 columns of result_all (rows whose `name` matches "DAC")
# as tab-separated text.
# The first column name is prefixed with "# " so the header row reads as a
# comment line. The guard keeps the prefix from being added again when this
# cell is re-run.
if (!startsWith(names(result_all)[1], "# ")) {
    names(result_all)[1] <- paste0("# ", names(result_all)[1])
}
result_all %>%
    dplyr::select(1:12) %>%
    filter(grepl("DAC", name)) %>%
    write.table(
        file = "radar_erv/result.all.bed",
        row.names = FALSE, sep = "\t", quote = FALSE
    )

In [None]:
# Save the first 12 columns of result_sig (rows whose `name` matches "DAC")
# as tab-separated text.
# The first column name is prefixed with "# " so the header row reads as a
# comment line. The guard keeps the prefix from being added again when this
# cell is re-run.
if (!startsWith(names(result_sig)[1], "# ")) {
    names(result_sig)[1] <- paste0("# ", names(result_sig)[1])
}
# Written to radar_erv/ like the other exports in this section; the original
# path pointed at radar/.
result_sig %>%
    dplyr::select(1:12) %>%
    filter(grepl("DAC", name)) %>%
    write.table(
        file = "radar_erv/result.sig.bed",
        row.names = FALSE, sep = "\t", quote = FALSE
    )

In [None]:
# Record the R version and loaded package versions for reproducibility.
sessionInfo()

In [None]:
# Print the current date and time via the shell.
# NOTE(review): `!date` is IPython shell-escape syntax; under an R kernel use
# `date()` instead -- confirm which kernel runs this cell.
!date