## RNAme analysis

In [1]:
import subprocess

import pandas as pd

In [2]:
def read_fasta(path):
    """Parse a FASTA file into a dict mapping record id to sequence.

    A record starts at every line that begins with ``>``. The id is the
    header text after ``>`` up to (not including) the first ``::``; the
    sequence is the concatenation of all lines up to the next header.
    Lines before the first header are ignored. If an id occurs more than
    once, the last record with that id wins.

    Parameters
    ----------
    path : str
        Path of the FASTA file to read.

    Returns
    -------
    dict
        ``{record_id: sequence}``; empty when the file has no records.
    """
    # The context manager closes the handle even if reading raises.
    with open(path) as handle:
        lines = handle.read().splitlines()

    # Only a leading '>' marks a header; a '>' elsewhere in a line must not
    # start a new record.
    header_rows = [i for i, line in enumerate(lines) if line.startswith('>')]
    # Sentinel so the last record extends to the end of the file.
    bounds = header_rows + [len(lines)]

    return {
        lines[start][1:].split('::')[0]: ''.join(lines[start + 1:stop])
        for start, stop in zip(bounds[:-1], bounds[1:])
    }

In [3]:
def radar_results_to_bed_and_fa(path, genome_fa='/data_gilbert/home/aarab/genomes/hg38/hg38.fa'):
    """Write the peaks of ``{path}/result.sig.txt`` as BED and extract their sequences.

    The first 12 columns of the tab-separated result table are written,
    without header or index, to ``{path}/peak.bed``. That file is sorted by
    column 1 and then numerically by column 2, and the sorted records are
    piped to ``bedtools getfasta -name -s -split``, which writes
    ``{path}/peak.fa``.

    NOTE(review): presumably the first 12 columns are BED12 fields -- confirm
    against the tool that produces ``result.sig.txt``.

    Parameters
    ----------
    path : str
        Directory containing ``result.sig.txt``; ``peak.bed`` and ``peak.fa``
        are written into the same directory.
    genome_fa : str, optional
        Reference FASTA passed to ``bedtools getfasta -fi``. Defaults to the
        location that was previously hard-coded in the command line.

    Raises
    ------
    subprocess.CalledProcessError
        If ``bedtools`` or ``sort`` exits with a non-zero status.
    """
    df = pd.read_csv(f'{path}/result.sig.txt', sep='\t')
    bed_file = f'{path}/peak.bed'
    fa_file = f'{path}/peak.fa'

    df.iloc[:, :12].to_csv(bed_file, sep='\t', index=False, header=False)

    # Argument lists (no shell) keep paths with spaces or shell
    # metacharacters intact.
    sort_cmd = ['sort', '-k1,1', '-k2,2n', bed_file]
    getfasta_cmd = [
        'bedtools', 'getfasta', '-name', '-s',
        '-fi', genome_fa,
        '-bed', '-', '-split',
        '-fo', fa_file,
    ]

    # Equivalent of `sort ... | bedtools getfasta ...`: bedtools reads the
    # sorted BED records straight from sort's stdout.
    with subprocess.Popen(sort_cmd, stdout=subprocess.PIPE) as sorter:
        result = subprocess.run(
            getfasta_cmd, stdin=sorter.stdout, stderr=subprocess.PIPE, text=True
        )

    # Re-emit bedtools diagnostics so they still show up in the cell output.
    if result.stderr:
        print(result.stderr, end='')

    # Check bedtools first: when it dies early, sort fails too (broken pipe),
    # and the bedtools error is the informative one.
    if result.returncode != 0:
        raise subprocess.CalledProcessError(
            result.returncode, getfasta_cmd, stderr=result.stderr
        )
    if sorter.returncode != 0:
        raise subprocess.CalledProcessError(sorter.returncode, sort_cmd)

In [11]:
# Build one workbook with a sheet per repeat class; each sheet holds the rows
# of that class's result table whose logFC is positive.
with pd.ExcelWriter("hERV-hyper-m6A-peaks.xlsx") as writer:
    for name in ('line', 'sine', 'retroposon'):
        df = pd.read_csv(f'meRIP-seq/radar_herv/{name}/result.sig.txt', sep='\t')
        # The sheet is named after the repeat class; the row index is not exported.
        df[df['logFC'] > 0].to_excel(writer, sheet_name=name, index=False)

#### LINEs

In [74]:
# Write peak.bed and peak.fa next to result.sig.txt for the LINE results.
radar_results_to_bed_and_fa('meRIP-seq/radar_herv/line')

In [102]:
# Load the LINE peak sequences as a dict keyed by the header text before '::'.
fa = read_fasta('meRIP-seq/radar_herv/line/peak.fa')

In [91]:
# Load the tab-separated LINE result table.
df = pd.read_csv('meRIP-seq/radar_herv/line/result.sig.txt',sep='\t')

In [107]:
# Pair every LINE peak whose logFC exceeds 20 with its extracted sequence.
[(peak_name, fa[peak_name]) for peak_name in df.loc[df['logFC'] > 20, 'name'].to_list()]

[('1011737:LINE:L1P1_orf2',
  'ttgaaaccaatgagaacaaagacacaacataccagaatctctgggacaca'),
 ('2172981:LINE:L1P1_orf2',
  'ttgaaaccaatgagaacaaagacacaacataccagaatctctgggacaca'),
 ('2350540:LINE:L1MA10_3end',
  'acaggtgcttgaaggcagcatgctcgttaagagtcatcaccactccctaa'),
 ('3429068:LINE:L1PA4_3end',
  'atatccagaatctacaatgaactcaaacaagcttacaagaaaaaaacaaa')]

#### SINEs

In [108]:
# Write peak.bed and peak.fa next to result.sig.txt for the SINE results.
radar_results_to_bed_and_fa('meRIP-seq/radar_herv/sine')

In [109]:
# Load the SINE peak sequences as a dict keyed by the header text before '::'.
fa = read_fasta('meRIP-seq/radar_herv/sine/peak.fa')

In [110]:
# Load the tab-separated SINE result table.
df = pd.read_csv('meRIP-seq/radar_herv/sine/result.sig.txt',sep='\t')

In [111]:
# Pair every SINE peak whose logFC exceeds 20 with its extracted sequence.
[(peak_name, fa[peak_name]) for peak_name in df.loc[df['logFC'] > 20, 'name'].to_list()]

[('1964087:SINE:AluSx',
  'ctgggtgacaagcaagactccgtctcaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'),
 ('3275173:SINE:AluJb',
  'tccagcttgggtgacggagaggagcgagaccctgtctcaaaaaaaaaaaaaaa'),
 ('409648:SINE:AluJb', 'agccaggagtttgagatcagcctaggcaacaaatgagactccgtctctta'),
 ('491781:SINE:AluSx', 'aactccgtctctaataaaaatacaaaaaaattagctggatgtggtggcgt'),
 ('870057:SINE:AluSx', 'gaaaccccatctccctactaaaaatacaaaaatgggctgggcgcagtggc')]

#### Retroposons

In [114]:
# Write peak.bed and peak.fa next to result.sig.txt for the retroposon results.
radar_results_to_bed_and_fa('meRIP-seq/radar_herv/retroposon')

In [115]:
# Load the retroposon peak sequences as a dict keyed by the header text before '::'.
fa = read_fasta('meRIP-seq/radar_herv/retroposon/peak.fa')

In [116]:
# Load the tab-separated retroposon result table.
df = pd.read_csv('meRIP-seq/radar_herv/retroposon/result.sig.txt',sep='\t')

In [117]:
# Pair every retroposon peak whose logFC exceeds 20 with its extracted sequence.
[(peak_name, fa[peak_name]) for peak_name in df.loc[df['logFC'] > 20, 'name'].to_list()]

[('1459712:Retroposon:SVA_D',
  'acagatgcttgaaggcagcatgctccttaagagtcatcaccactccctaa'),
 ('1851595:Retroposon:SVA_D',
  'ccactccctaatctttaagtacccagggacacaaacactgcggaaggccg'),
 ('1971872:Retroposon:SVA_F',
  'gcttgaaggcagcatgctcgttaagagtcatcaccactccctaatctcaa'),
 ('2350541:Retroposon:SVA_D',
  'aacaggtgcttgaaggcagcatgctcgttaagagtcatcaccactcccta'),
 ('421041:Retroposon:SVA_D',
  'ttaaacagatgcttgaaggcagcatgctccttaagagtcatcaccactcc'),
 ('740219:Retroposon:SVA_B',
  'ggggagcgcctctgccccgccgccccgtctgagatgtgaggagcgcctct')]

## Load RNA-seq data