# Extract Nanopore current values from Tombo outputs

In [None]:
import numpy as np
import pandas as pd
import pickle
import h5py
import random
from tombo import tombo_helper
from tqdm.notebook import tqdm

In [None]:
# Input configuration for the extraction cell below.

# Directory of fast5 files for each sample (placeholder paths; edit before running).
fast5s = {
    'unm': 'path/to/fast5',
    'm6A': 'path/to/fast5',
    'vivo': 'path/to/fast5',
}

# Per-sample Tombo group name: passed as `corrected_group` to TomboReads and
# used as the 'Analyses/<group>' node when reading events from each fast5.
tombo_groups = {
    'unm': 'sams-3_E2E3L_and_sams-5_E2LE3L',
    'm6A': 'sams-3_E2E3L_and_sams-5_E2LE3L',
    'vivo': 'All_sams_transcripts',
}

# Transcript name -> coordinate of the m6A site within that transcript.
plotSites = {
    'sams-3_E2/E3L': 1262,
    'sams-3_retained': 1538,
    'sams-4_E2/E3L': 1265,
    'sams-4_E2L/E3L': 1388,
    'sams-4_retained': 1545,
    'sams-5_E2L/E3L': 1315,
}

In [None]:
# extract current data
#
# For every sample in `fast5s` and every transcript in `plotSites`, each read
# that spans the m6A site +-50 nt contributes one row to `current`:
# the window's 'norm_mean' values, followed by its 'norm_stdev' values,
# followed by its 'length' values (101 positions each). Rows are labelled with
# the transcript ('sams') and the sample ('RNA'); the combined table is pickled.

# number of positions kept on each side of the m6A site
flank = 50

# each sample
sample_frames = []
for sample_name, fast5_path in fast5s.items():

    corrected_group = tombo_groups[sample_name]
    # HDF5 dataset holding the per-base events written under this group
    events_path = 'Analyses/' + corrected_group + '/BaseCalled_template/Events'

    # load tombo-annotated reads
    tmb = tombo_helper.TomboReads([fast5_path], corrected_group=corrected_group)

    # each transcript
    transcript_frames = []
    for transcript, position in plotSites.items():

        # tracking
        print(transcript + ' in ' + sample_name)

        # retrieve reads on the sams transcript
        reads = tmb.get_cs_reads(chrm=transcript, strand='+')
        random.shuffle(reads)

        # each read
        rows = []
        for read in tqdm(reads, desc='reads', leave=False):

            # window of the m6A site and +-flank positions, as indices into
            # the read's events
            # NOTE(review): the "- 1" presumably converts a 1-based site
            # coordinate to a 0-based event index -- confirm
            plotStart = position - read.start - flank - 1
            plotEnd = position - read.start + flank

            # skip reads that do not cover the whole window
            if plotStart < 0 or read.end < (position + flank):
                continue

            # Read only the window, once, and close the file right away;
            # leaving one handle open per read can exhaust file descriptors.
            # Slicing the dataset replaces `Dataset.value`, which was removed
            # in h5py 3.0.
            with h5py.File(read.fn, 'r') as f5:
                events = f5[events_path][plotStart:plotEnd]

            # mean, stdev and duration
            rows.append(np.concatenate([events['norm_mean'],
                                        events['norm_stdev'],
                                        events['length']]))

        df1 = pd.DataFrame(rows)

        # label the sams transcript name
        df1['sams'] = transcript
        transcript_frames.append(df1)

    # Concatenate once per sample: DataFrame.append was removed in pandas 2.0
    # and re-copied the accumulated frame on every call.
    df2 = pd.concat(transcript_frames) if transcript_frames else pd.DataFrame()

    # label the sample name
    df2['RNA'] = sample_name
    sample_frames.append(df2)

current = pd.concat(sample_frames) if sample_frames else pd.DataFrame()

# save extracted current data in a pickle file
with open('fast5_current_m6A_sams-345_100nt.pickle', 'wb') as f:
    pickle.dump(current, f)