# Extract Nanopore current values from Tombo outputs

In [13]:
import numpy as np
import pandas as pd
import pickle
import h5py
import random
from tombo import tombo_helper
from tqdm.notebook import tqdm

In [2]:
# Configuration for the extraction: where the fast5 files live, which Tombo
# group each sample was annotated under, and which site to extract per transcript.

# fast5 directory for each sample (keyed by sample name)
fast5s = dict(
    unm='path/to/fast5',
    m6A='path/to/fast5',
    vivo='path/to/fast5',
)

# per-sample Tombo group name; the extraction code passes it to TomboReads as
# `corrected_group` and uses it as the 'Analyses/<group>' node inside each fast5
tombo_groups = dict(
    unm='sams-3_E2E3L_and_sams-5_E2LE3L',
    m6A='sams-3_E2E3L_and_sams-5_E2LE3L',
    vivo='All_sams_transcripts',
)

# position of the m6A site in each transcript (keyed by transcript name)
plotSites = dict([
    ('sams-3 E2/E3L', 1262),
    ('sams-3 retained', 1538),
    ('sams-4 E2/E3L', 1265),
    ('sams-4 E2L/E3L', 1388),
    ('sams-4 retained', 1545),
    ('sams-5 E2L/E3L', 1315),
])

In [15]:
# extract current data
#
# For every sample / transcript pair, go through the Tombo-annotated reads and
# keep those covering the m6A site with `flank` positions on each side. Each
# kept read becomes one row: the windowed norm_mean values, then norm_stdev,
# then length, concatenated. Rows are labelled with the transcript ('sams')
# and the sample ('RNA'), and the combined table is pickled.

# positions kept on each side of the m6A site; the window therefore holds
# 2 * flank + 1 positions (101 for flank = 50), i.e. the site and +-50
flank = 50

# each sample
sample_frames = []
for sample_name, fast5_path in fast5s.items():

    corrected_group = tombo_groups[sample_name]
    # HDF5 node holding the per-position event table of this Tombo group
    events_path = 'Analyses/' + corrected_group + '/BaseCalled_template/Events'

    # load tombo-annotated reads
    tmb = tombo_helper.TomboReads([fast5_path], corrected_group=corrected_group)

    # each transcript
    transcript_frames = []
    for transcript, position in plotSites.items():

        # tracking
        print(transcript + ' in ' + sample_name)

        # retrieve reads on the sams transcript
        reads = tmb.get_cs_reads(chrm=transcript, strand='+')
        # NOTE(review): the shuffle is unseeded, so row order differs between
        # runs; seed `random` in the config cell if reproducibility matters.
        random.shuffle(reads)

        # each read
        rows = []
        for read in tqdm(reads, desc='reads', leave=False):

            # window bounds as offsets into the read's event table
            # (presumably `position` is 1-based and `read.start` 0-based,
            # hence the extra -1 on the start -- TODO confirm)
            plotStart = position - read.start - flank - 1
            plotEnd = position - read.start + flank

            # skip reads that do not cover the whole window
            if plotStart < 0 or read.end < (position + flank):
                continue

            # Read the event table once and close the file straight away.
            # `dset[()]` replaces the deprecated `dset.value` (removed in h5py 3).
            with h5py.File(read.fn, 'r') as f5:
                events = f5[events_path][()]
            window = events[plotStart:plotEnd]

            # mean, stdev and duration
            rows.append(np.concatenate([window['norm_mean'],
                                        window['norm_stdev'],
                                        window['length']]))

        df1 = pd.DataFrame(rows)

        # label the sams transcript name
        df1['sams'] = transcript
        transcript_frames.append(df1)

    # Join once per sample instead of growing a frame with the removed
    # DataFrame.append; fall back to an empty frame when nothing was collected.
    df2 = pd.concat(transcript_frames) if transcript_frames else pd.DataFrame()

    # label the sample name
    df2['RNA'] = sample_name
    sample_frames.append(df2)

current = pd.concat(sample_frames) if sample_frames else pd.DataFrame()


# save extracted current data in a pickle file
with open('fast5_current_m6A_sams-345_100nt.pickle', 'wb') as f:
    pickle.dump(current, f)

[07:00:38] Parsing Tombo index file(s).


sams-3b in unm


HBox(children=(IntProgress(value=0, description='reads', max=7954, style=ProgressStyle(description_width='init…

sams-3c in unm


HBox(children=(IntProgress(value=1, bar_style='info', description='reads', max=1, style=ProgressStyle(descript…

sams-4b in unm


HBox(children=(IntProgress(value=1, bar_style='info', description='reads', max=1, style=ProgressStyle(descript…

sams-4c in unm


HBox(children=(IntProgress(value=1, bar_style='info', description='reads', max=1, style=ProgressStyle(descript…

sams-4d in unm


HBox(children=(IntProgress(value=1, bar_style='info', description='reads', max=1, style=ProgressStyle(descript…

sams-5b in unm


HBox(children=(IntProgress(value=0, description='reads', max=13736, style=ProgressStyle(description_width='ini…

[07:03:23] Parsing Tombo index file(s).


sams-3b in m6A


HBox(children=(IntProgress(value=0, description='reads', max=3295, style=ProgressStyle(description_width='init…

sams-3c in m6A


HBox(children=(IntProgress(value=1, bar_style='info', description='reads', max=1, style=ProgressStyle(descript…

sams-4b in m6A


HBox(children=(IntProgress(value=1, bar_style='info', description='reads', max=1, style=ProgressStyle(descript…

sams-4c in m6A


HBox(children=(IntProgress(value=1, bar_style='info', description='reads', max=1, style=ProgressStyle(descript…

sams-4d in m6A


HBox(children=(IntProgress(value=1, bar_style='info', description='reads', max=1, style=ProgressStyle(descript…

sams-5b in m6A


HBox(children=(IntProgress(value=0, description='reads', max=5376, style=ProgressStyle(description_width='init…

[07:05:09] Parsing Tombo index file(s).


sams-3b in vivo


HBox(children=(IntProgress(value=0, description='reads', max=110, style=ProgressStyle(description_width='initi…

sams-3c in vivo


HBox(children=(IntProgress(value=0, description='reads', max=29, style=ProgressStyle(description_width='initia…

sams-4b in vivo


HBox(children=(IntProgress(value=0, description='reads', max=49, style=ProgressStyle(description_width='initia…

sams-4c in vivo


HBox(children=(IntProgress(value=0, description='reads', max=58, style=ProgressStyle(description_width='initia…

sams-4d in vivo


HBox(children=(IntProgress(value=0, description='reads', max=32, style=ProgressStyle(description_width='initia…

sams-5b in vivo


HBox(children=(IntProgress(value=0, description='reads', max=18, style=ProgressStyle(description_width='initia…

[07:05:26] Parsing Tombo index file(s).


sams-3b in unm long


HBox(children=(IntProgress(value=0, description='reads', max=128267, style=ProgressStyle(description_width='in…

sams-3c in unm long


HBox(children=(IntProgress(value=0, description='reads', max=58852, style=ProgressStyle(description_width='ini…

sams-4b in unm long


HBox(children=(IntProgress(value=0, description='reads', max=53515, style=ProgressStyle(description_width='ini…

sams-4c in unm long


HBox(children=(IntProgress(value=0, description='reads', max=53504, style=ProgressStyle(description_width='ini…

sams-4d in unm long


HBox(children=(IntProgress(value=0, description='reads', max=59187, style=ProgressStyle(description_width='ini…

sams-5b in unm long


HBox(children=(IntProgress(value=0, description='reads', max=5, style=ProgressStyle(description_width='initial…