# TEISER

<!-- > ## Initializing the structural seeds
TEISER starts by evaluating a predefined set of structural motifs, which we call seeds. The significant seeds are then further optimized and elongated into more informative motifs. You can initialize the seed space as you see fit, but we used the following criteria:

        stem length: from 4bp to 7bp.
        loop length: from 4nt to 9nt.
        Number of informative bases: 4nt to 6nt.
        Information of the motif: 14-20. -->

<!-- > This program creates the seeds that satisfy the set constraints and packages them into separate files, each containing 250,000 independent seeds. We recommend using the seeds folder in the TEISER_Data directory to deposit the seed files. For example, for the above parameters, set the 'outfile' parameter to "$TEISERDIR/TEISER_Data/seeds/seeds.4-7.4-9.4-6.14". There is a file called "seedfiles.txt" in this folder which must contain all the generated files (with paths reported relative to the TEISER home directory); if not, modify this file as needed. For each species, in the species_data folder, this file is set as a parameter, which enables TEISER to locate all the necessary seeds. -->



In [1]:
# %%bash 
# export TEISERDIR=/data_gilbert/home/aarab/Workflows/TEISERv1.1/

# $TEISERDIR/Programs/seed_creator \
#     -min_stem_length 4 \
#     -max_stem_length 7 \
#     -min_loop_length 4 \
#     -max_loop_length 9 \
#     -min_inf_seq 4 \
#     -max_inf_seq 6 \
#     -min_inf 14 \
#     -max_inf 20 \
#     -outfile TEISER_results/seed

## pyteiser
https://github.com/goodarzilab/pyteiser
> A framework for identifying the structural motifs that are informative of whole-genome measurements across all the transcripts



In [9]:
!head radar/result.sig.fa

>ENSG00000001084.13::chr6:53498437-53498486(-)
ACTCTGGAGCAACCTACTGTCTAAGCAGTTTTGTAAATGTACTGGTAAT
>ENSG00000001629.10::chr7:92398338-92398387(+)
CGATAGGCACTTCTTTACCTTCCAGGCTGGACTCTGTCCCCAGAAATAC
>ENSG00000003393.15::chr2:201757551-201757600(-)
TATAGTACAACCCCTTGTGAAACTGGAGCTCAGGCAGGCAGTAGTGCCA
>ENSG00000003393.15::chr2:201760946-201760995(-)
CAGAAACATACCATCATACCCTGACACCCAAGCAGTCAATGAATACCTA
>ENSG00000003393.15::chr2:201761345-201761394(-)
GCGAGTGGTGCTTCAAGTTGCCTGTGGTGCTTTCCACAGCTTAGCCCTT


### prepare inputs

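> The expression-values file needs an annotation-name column and a measurement column (here `peak_name` and `logFC`); their names have to be specified with the `--anno_name_column` and `--measur_column` options.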

In [93]:
!echo -e "index\tpeak_name\tlogFC" > result.sig.txt.exp_values_file
!cat radar/result.sig.txt | awk 'NR>1{print 0"\t"$15"::"$1":"$2"-"$3"("$6")\t"$13}' >> result.sig.txt.exp_values_file

In [94]:
!head result.sig.txt.exp_values_file

index	peak_name	logFC
0	ENSG00000001084.13::chr6:53498437-53498486(-)	-1.09289798740744
0	ENSG00000001629.10::chr7:92398338-92398387(+)	-0.950192283583678
0	ENSG00000003393.15::chr2:201757551-201757600(-)	-0.830939338045323
0	ENSG00000003393.15::chr2:201760946-201760995(-)	-1.10279639519063
0	ENSG00000003393.15::chr2:201761345-201761394(-)	1.57365896834208
0	ENSG00000003402.20::chr2:201129982-201130031(+)	-1.64690884386422
0	ENSG00000004142.12::chr17:28347325-28347374(-)	-1.12467239533797
0	ENSG00000004142.12::chr17:28347523-28347572(-)	1.94591014888379
0	ENSG00000004487.17::chr1:23083490-23083539(+)	-0.885893254689784


___

### 1. Generate seeds

In [121]:
mkdir /data_gilbert/home/aarab/Workflows/pyteiser_files/seeds

In [None]:
!python /data_gilbert/home/aarab/Workflows/pyteiser_files/pyteiser/pyteiser/seeds_generator.py \
    --prefix seed \
    --outfolder ~/Workflows/pyteiser_files/seeds/

In [134]:
!ls -l ~/Workflows/pyteiser_files/seeds  | wc -l

683094


### 2. Convert sequences from fasta to binary format

In [151]:
!python /data_gilbert/home/aarab/Workflows/pyteiser_files/pyteiser/pyteiser/wrappers/binarize_sequences.py \
    --rna_fastafile radar/result.sig.fa \
    --rna_bin_file radar/result.sig.bin
    
#     --prefix seed \
#     --outfolder ~/Workflows/pyteiser_files/seeds/

Read sequence number  1000
Read sequence number  2000
Read sequence number  3000
Read sequence number  4000
Compressed sequence number  0
Compressed sequence number  1000
Compressed sequence number  2000
Compressed sequence number  3000
Compressed sequence number  4000


### 3. Precalculate seed occurrence profiles
Use `pyteiser/wrappers/calculate_seed_profiles.py` - run on HPC!


In [155]:
# NOTE(review): unfinished draft of the invocation - it cannot run as written:
#   * the `!python ...` line has no trailing `\`, so the option lines below are
#     not part of the shell command;
#   * `--seed_folder ~/Workflows/pyteiser_files/seeds/--rna_bin_file` fuses the
#     seed folder path with the next option name;
#   * none of the other options has been given a value yet.
# The option names mirror the argparse definitions in the handler() excerpt
# pasted further down in this cell.
!python /data_gilbert/home/aarab/Workflows/pyteiser_files/pyteiser/pyteiser/wrappers/calculate_seed_profiles.py 
--task_mapping_file \
--seed_folder ~/Workflows/pyteiser_files/seeds/--rna_bin_file \
--out_folder 
--inp_filename_template
--out_filename_template
--print_qstat
--path_to_qstat
--are_seeds_degenerate
--indices_mode
--index_bit_width

#     --rna_fastafile radar/result.sig.fa \
#     --rna_bin_file radar/result.sig.bin
    
# #     --prefix seed \
# #     --outfolder ~/Workflows/pyteiser_files/seeds/

import numpy as np
import argparse
import os

from pyteiser import IO
from pyteiser import matchmaker
from pyteiser import type_conversions
from pyteiser import sge

def handler():
    """Declare the command-line options listed in the draft invocation above.

    Excerpt only: as shown here the parser is built and its options are
    registered, but it is never parsed or returned, so this function returns
    None. Presumably the upstream function continues past this point -
    TODO confirm against the pyteiser source.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--task_mapping_file", help="", type=str)

    parser.add_argument("--seed_folder", help="", type=str)
    parser.add_argument("--rna_bin_file", help="", type=str)
    parser.add_argument("--out_folder", help="", type=str)
    parser.add_argument("--inp_filename_template", help="", type=str)
    parser.add_argument("--out_filename_template", help="", type=str)
    parser.add_argument("--print_qstat", help="", type=str)
    parser.add_argument("--path_to_qstat", help="", type=str)
    parser.add_argument("--are_seeds_degenerate", help="", type=str)
    # NOTE(review): with type=bool argparse calls bool() on the raw string, so
    # any non-empty value (including "False") is parsed as True.
    parser.add_argument("--indices_mode", help="compression in the index mode", type=bool)
    parser.add_argument("--index_bit_width", help="number of bits per one index when compressing", type=int)



### run `pyteiser_pipeline`

In [152]:
# NOTE(review): `--seeds_file` is given the seeds *directory* here. The
# traceback below stops in read_input_files(seeds_filename_full, rna_bin_filename),
# so the pipeline presumably expects the path of a single seeds file - confirm.
!pyteiser_pipeline \
    --rna_fastafile radar/result.sig.fa \
    --exp_values_file result.sig.txt.exp_values_file \
    --anno_name_column peak_name \
    --measur_column logFC \
    --seeds_file ~/Workflows/pyteiser_files/seeds/

Read sequence number  1000
Read sequence number  2000
Read sequence number  3000
Read sequence number  4000
Compressed sequence number  0
Compressed sequence number  1000
Compressed sequence number  2000
Compressed sequence number  3000
Compressed sequence number  4000
Traceback (most recent call last):
  File "/data_gilbert/home/aarab/anaconda3/envs/pyteiser/bin/pyteiser_pipeline", line 5, in <module>
    pyteiser_pipeline.main()
  File "/data_gilbert/home/aarab/anaconda3/envs/pyteiser/lib/python3.10/site-packages/pyteiser/wrappers/pyteiser_pipeline.py", line 195, in main
    calculate_seed_profiles.non_sge_dependent_main(
  File "/data_gilbert/home/aarab/anaconda3/envs/pyteiser/lib/python3.10/site-packages/pyteiser/wrappers/calculate_seed_profiles.py", line 116, in non_sge_dependent_main
    n_motifs_list, n_seqs_list = read_input_files(seeds_filename_full, rna_bin_filename)
  File "/data_gilbert/home/aarab/anaconda3/envs/pyteiser/lib/python3.10/site-packages/pyteiser/wrappers/calcul

# iPAGE 
https://github.com/goodarzilab/PAGE  

In [1]:
import pandas as pd 

In [2]:
# Load the full RADAR result table and keep only the three columns used below.
df = pd.read_csv('radar/result.all.txt', sep='\t')[['name', 'logFC', 'p_value']]

https://github.com/goodarzilab/server_wiki/tree/main/lab_softwares

In [3]:
%reload_ext rpy2.ipython

In [4]:
%%R -i df -o data
# `df` is passed in from Python (-i df); `data` is exported back (-o data).
data = df
# Signed score: sign of logFC times (1 - p_value).
# Despite the column name, this is not a multiple-testing-corrected FDR.
data[['fdr']] <- sign(data[,'logFC'])*(1-data[,'p_value'])
# Drop rows whose score is NA.
data <- data[!is.na(data$'fdr'), ]
# Keep only these four columns, in this order.
data <- data[, c('name', 'fdr','logFC','p_value')]

Select max logFC for each gene 

In [5]:
%%time 
data = data.iloc[[int(data[(data.name == gene)].logFC.idxmax()) for gene in set(data.name)],:]

CPU times: user 20.4 s, sys: 54.3 ms, total: 20.5 s
Wall time: 20.5 s


In [7]:
# data.plot(x='logFC', y='fdr', kind='scatter')
# data.plot(x='logFC', y='p_value', kind='scatter')
# data.plot(x='fdr', y='p_value', kind='scatter')
# plt.show()

In [33]:
# Write the two headerless, index-free, tab-separated tables (gene name + score).
data[['name', 'fdr']].to_csv(
    'hl60_delta_mtyl_fdr.txt', sep='\t', index=False, header=False
)
data[['name', 'logFC']].to_csv(
    'hl60_delta_mtyl.txt', sep='\t', index=False, header=False
)

In [7]:
%%bash 
export PAGEDIR=/data_gilbert/home/aarab/iPAGE

# Start from an empty log on every run.
rm -f ipage.out; touch ipage.out

# One background run per input table, all appending to the same log.
# `nohup` now wraps `parallel` (the long-running process); in the previous
# `nohup ls FILE | parallel ...` form it only covered the short-lived `ls`.
for exp_file in hl60_delta_mtyl.txt hl60_delta_mtyl_fdr.txt; do
    ls "$exp_file" \
        | nohup parallel -j8 -k bash ~/Projects/pager/ipage_loop.sh {} \
        &>> ipage.out &
done

In [None]:
!cat ipage.out

___
# 

In [1]:
import sys
import pandas as pd 
import numpy as np
from time import time 
from glob import glob
# from itertools import chain
# from functools import reduce
# import matplotlib.pyplot as plt
# from matplotlib.pyplot import subplots
# from venn import venn

pager_dir = '/data_gilbert/home/aarab/Projects/pager/'
# pager_dir = '/rumi/shams/abe/Projects/ipage-down/'
sys.path.append(pager_dir)

import ipage_down as ipd

In [2]:
def get_pvmatrix_list(parent_path,pattern):
    """
    Collect the `pvmatrix.txt` files found one level below `parent_path`,
    restricted to sub-folders whose name contains `pattern`.

    parent_path: folder that holds one sub-folder per run
    pattern: msigdb gene set cluster name
    Returns the matching paths as a list, in the order `glob` yields them.
    """
    search_glob = '{}/*{}*/pvmatrix.txt'.format(parent_path, pattern)
    matches = glob(search_glob)
    return matches

In [3]:
# Column labels applied to every enrichment matrix below, taken from the
# "c2" runs of the fdr-scored input.
cols_frames = []
for signal_mode in ['up', 'both', 'down']:
    merged_pvmat = ipd.merge_multiple_pvmat(
        get_pvmatrix_list("hl60_delta_mtyl_fdr", "c2")
    )
    cols_frames.append(ipd.pvmat2bio_signal(merged_pvmat, signal_mode, n_clust=1))
cols = pd.concat(cols_frames).columns

In [4]:
pv_signal = pd.concat([
    ipd.pvmat2bio_signal(ipd.merge_multiple_pvmat(
        get_pvmatrix_list("hl60_delta_mtyl_fdr","c2.cp.kegg") + \

        get_pvmatrix_list("hl60_delta_mtyl","c5.go.") + \
        get_pvmatrix_list("hl60_delta_mtyl_fdr","c5.go.") #+ \

    ),s,n_clust=1) for s in ['up','both','down']
])

pv_signal.columns = cols
pv_signal=pv_signal[~pv_signal.duplicated()]

pv_signal.to_csv('RNA-mtyl-geneset-enrichment-pvmatrix.txt',sep='\t')

!bash {pager_dir}/ipage_draw_matrix.sh \
    'hl60_delta_mtyl.txt' \
    'RNA-mtyl-geneset-enrichment-pvmatrix.txt' \
    'RNA-mtyl-geneset-enrichment-GOandKEGG.pdf' &> /dev/null

!rm -v 'RNA-mtyl-geneset-enrichment-pvmatrix.txt'

removed ‘RNA-mtyl-geneset-enrichment-pvmatrix.txt’


In [5]:
pv_signal = pd.concat([
    ipd.pvmat2bio_signal(ipd.merge_multiple_pvmat(
        get_pvmatrix_list("hl60_delta_mtyl","c5.go.") + \
        get_pvmatrix_list("hl60_delta_mtyl_fdr","c5.go.") #+ \

    ),s,n_clust=1) for s in ['up','both','down']
])

pv_signal.columns = cols
pv_signal=pv_signal[~pv_signal.duplicated()]

pv_signal.to_csv('RNA-mtyl-geneset-enrichment-pvmatrix.txt',sep='\t')

!bash {pager_dir}/ipage_draw_matrix.sh \
    'hl60_delta_mtyl.txt' \
    'RNA-mtyl-geneset-enrichment-pvmatrix.txt' \
    'RNA-mtyl-geneset-enrichment-GO.pdf' &> /dev/null

!rm -v 'RNA-mtyl-geneset-enrichment-pvmatrix.txt'

removed ‘RNA-mtyl-geneset-enrichment-pvmatrix.txt’


In [149]:
pv_signal = pd.concat([
    ipd.pvmat2bio_signal(ipd.merge_multiple_pvmat(
        get_pvmatrix_list("hl60_delta_mtyl","c3.mir") + \
        get_pvmatrix_list("hl60_delta_mtyl_fdr","c3.mir")
    ),s,n_clust=1) for s in ['up','both','down']
])

pv_signal.columns = cols
pv_signal=pv_signal[~pv_signal.duplicated()]

pv_signal.to_csv('RNA-mtyl-geneset-enrichment-pvmatrix.txt',sep='\t')

!bash {pager_dir}/ipage_draw_matrix.sh \
    'hl60_delta_mtyl.txt' \
    'RNA-mtyl-geneset-enrichment-pvmatrix.txt' \
    'RNA-mtyl-geneset-enrichment-miR.pdf' &> /dev/null

!rm -v 'RNA-mtyl-geneset-enrichment-pvmatrix.txt'

removed ‘RNA-mtyl-geneset-enrichment-pvmatrix.txt’


In [150]:
pv_signal = pd.concat([
    ipd.pvmat2bio_signal(ipd.merge_multiple_pvmat(
        get_pvmatrix_list("hl60_delta_mtyl","c1") + \
        get_pvmatrix_list("hl60_delta_mtyl_fdr","c1")
    ),s,n_clust=1) for s in ['up','both','down']
])

pv_signal.columns = cols
pv_signal=pv_signal[~pv_signal.duplicated()]

pv_signal.to_csv('RNA-mtyl-geneset-enrichment-pvmatrix.txt',sep='\t')

!bash {pager_dir}/ipage_draw_matrix.sh \
    'hl60_delta_mtyl.txt' \
    'RNA-mtyl-geneset-enrichment-pvmatrix.txt' \
    'RNA-mtyl-geneset-enrichment-Chr.pdf' &> /dev/null

!rm -v 'RNA-mtyl-geneset-enrichment-pvmatrix.txt'

removed ‘RNA-mtyl-geneset-enrichment-pvmatrix.txt’


### GOBP_HISTONE_H3_K36_METHYLATION

In [6]:
ipd.detect_gs_cluster(get_pvmatrix_list("hl60_delta_mtyl_fdr","c"),"GOBP_HISTONE_H3_K36_METHYLATION")

['hl60_delta_mtyl_fdr/msigdb_v7.4_c5.go/pvmatrix.txt',
 'hl60_delta_mtyl_fdr/msigdb_v7.4_c5.go.bp/pvmatrix.txt',
 'hl60_delta_mtyl_fdr/msigdb_v7.4_c5.all/pvmatrix.txt']

In [7]:
# Flatten, over clusters '8', '9' and '10', the first value of the dict that
# ipd.bin_identifier_genes returns for this gene set in the c5.go.bp run.
# NOTE(review): the cluster ids are hard-coded; presumably they are the bins of
# interest for this run - confirm before reusing on other data.
[gene for clust in ['8','9','10'] for gene in list(ipd.bin_identifier_genes(
    "hl60_delta_mtyl_fdr/msigdb_v7.4_c5.go.bp",clust,'GOBP_HISTONE_H3_K36_METHYLATION'
).values())[0]]

['SETD5', 'SETMAR', 'BCOR', 'NSD2', 'SETD2', 'PAXIP1', 'NSD1', 'ASH1L', 'BRD4']

### KEGG_LYSINE_DEGRADATION

In [8]:
ipd.detect_gs_cluster(get_pvmatrix_list("hl60_delta_mtyl_fdr","c"),"KEGG_LYSINE_DEGRADATION")

['hl60_delta_mtyl_fdr/msigdb_v7.4_c2.cp.kegg/pvmatrix.txt']

In [9]:
# Flatten, over clusters '8', '9' and '10', the first value of the dict that
# ipd.bin_identifier_genes returns for this gene set in the c2.cp.kegg run.
# NOTE(review): the cluster ids are hard-coded; presumably they are the bins of
# interest for this run - confirm before reusing on other data.
[gene for clust in ['8','9','10'] for gene in list(ipd.bin_identifier_genes(
    "hl60_delta_mtyl_fdr/msigdb_v7.4_c2.cp.kegg",clust,'KEGG_LYSINE_DEGRADATION'
).values())[0]]

['DOT1L',
 'KMT5A',
 'OGDH',
 'SETMAR',
 'GCDH',
 'SETDB2',
 'KMT5B',
 'ALDH1B1',
 'SETD1B',
 'ECHS1',
 'NSD2',
 'SETD2',
 'NSD1',
 'SETD1A',
 'ASH1L',
 'EHMT1']

___
# 

In [37]:
!date

Mon Jan  3 15:13:38 PST 2022
