In [1]:
import numpy as np
import pandas as pd
import pod5 as p5
import matplotlib.pyplot as plt

In [2]:
# Input files: per-read poly(A) coordinates (TSV) and the raw signal container (POD5).
tsv_file = "egfp_a60_30_polyA_position.tsv"
pod_file = "egfp_a60_30.pod5"

# Tab-separated table; it is joined on its `read_id` column below.
polyA_df = pd.read_csv(tsv_file, sep="\t")

# Collect (read id as string, signal, pore) for every read in the POD5 file.
pod_data = []
with p5.Reader(pod_file) as reader:
    for read in reader.reads():
        pod_data.append((str(read.read_id), read.signal, read.pore))

pod_df = pd.DataFrame(pod_data, columns=["read_id", "signal", "pore"])

# Default (inner) join: only reads present in both tables are kept.
df = pd.merge(polyA_df, pod_df, on="read_id")

In [9]:
# Show the first row of the merged frame.
df.iloc[0, :]

read_id                 0ff79b01-d806-48e5-b0cc-93a4e853d984
start                                                   2749
end                                                     4263
signal     [270, 255, 252, 225, 259, 251, 245, 249, 265, ...
pore          Pore(channel=480, well=2, pore_type='not_set')
Name: 0, dtype: object

In [34]:
def vectorize(start, end, signal, nwindows, window_size):
    """Summarise the signal between ``start`` and ``end`` as ``nwindows`` window means.

    The whole signal is divided by the mean of the central 50% of the
    ``[start, end)`` region. Window ``i`` is anchored at offset
    ``i * (end - start) / nwindows`` from ``start``; the mean of the normalised
    samples inside that window, divided by ``window_size``, is element ``i`` of
    the result.

    Parameters
    ----------
    start, end : int
        Sample indices delimiting the region of interest in ``signal``.
    signal : array-like
        One-dimensional sequence of signal samples.
    nwindows : int
        Number of evenly spaced windows; also the length of the returned list.
    window_size : int
        Nominal window width in samples.

    Returns
    -------
    list of float
        One value per window. An entry is NaN when its window contains no
        samples or when the central region used for normalisation is empty
        (``np.mean`` of an empty slice).
    """
    signal = np.asarray(signal)
    span = end - start

    # Normalise by the mean of the central half (25%..75%) of the region.
    first_quantile = int(start + span * 0.25)
    third_quantile = int(start + span * 0.75)
    normalized = signal / np.mean(signal[first_quantile:third_quantile])

    # Loop invariants.
    half_floor = window_size // 2
    half_exact = window_size / 2
    n_samples = len(normalized)

    values = []
    for i in range(nwindows):
        # Exact quotient/remainder of i * span / nwindows. Computing the offset
        # as i * (span / nwindows) in floating point can land just below an
        # integer (e.g. 49 * (2 / 98) == 0.9999999999999999), which would
        # misclassify the window and shift it by one sample.
        offset, remainder = divmod(i * span, nwindows)

        if remainder == 0:
            # Offset falls exactly on a sample.
            lo = int(start + offset - half_floor)
            hi = int(start + offset + half_floor)
        else:
            # Offset falls between two samples; anchor on the lower one.
            # NOTE(review): because the slice end is exclusive, this branch
            # covers window_size - 1 samples for even window_size, one fewer
            # than the branch above -- confirm whether `hi` should be +1.
            lo = int(start + offset - (half_exact - 1))
            hi = int(start + offset + half_exact)

        # Clip the window to the bounds of the signal.
        lo = max(lo, 0)
        hi = min(hi, n_samples)

        # NOTE(review): the window mean is additionally divided by
        # window_size; kept as in the original -- confirm this scaling is intended.
        values.append(np.mean(normalized[lo:hi]) / window_size)

    return values


# Look fields up by column label: integer keys on a label-indexed Series
# (e.g. df.iloc[0][1]) are deprecated and emit a FutureWarning.
first_read = df.iloc[0]
vectorize(first_read["start"], first_read["end"], first_read["signal"], 5000, 10)

  vectorize(df.iloc[0][1], df.iloc[0][2], df.iloc[0][3], 5000, 10)


[0.06862445122155567,
 0.06840364275935173,
 0.06840364275935173,
 0.06840364275935173,
 0.07152784759691828,
 0.07152784759691828,
 0.07152784759691828,
 0.07540374081645572,
 0.07540374081645572,
 0.07540374081645572,
 0.0791856729882468,
 0.0791856729882468,
 0.0791856729882468,
 0.0791856729882468,
 0.08433004035236012,
 0.08433004035236012,
 0.08433004035236012,
 0.08898111221580507,
 0.08898111221580507,
 0.08898111221580507,
 0.09325633988826455,
 0.09325633988826455,
 0.09325633988826455,
 0.09325633988826455,
 0.09739062598910449,
 0.09739062598910449,
 0.09739062598910449,
 0.09673289865487997,
 0.09673289865487997,
 0.09673289865487997,
 0.09851815856206084,
 0.09851815856206084,
 0.09851815856206084,
 0.09851815856206084,
 0.09943427877758787,
 0.09943427877758787,
 0.09943427877758787,
 0.10096114580346627,
 0.10096114580346627,
 0.10096114580346627,
 0.1008436944937833,
 0.1008436944937833,
 0.1008436944937833,
 0.100632282136354,
 0.100632282136354,
 0.100632282136354,
 

In [None]:
nwindows = 5000
window_size = 10

# One feature vector per read. Iterating the labelled columns directly avoids
# rebuilding a row Series three times per read and the deprecated positional
# lookup `df.iloc[i][k]` on a label-indexed Series.
matrix = [
    vectorize(start, end, signal, nwindows, window_size)
    for start, end, signal in zip(df["start"], df["end"], df["signal"])
]

# Show only the number of rows: printing the whole matrix would write
# len(df) * nwindows floats into the cell output.
len(matrix)

In [33]:
# Number of rows in the merged frame.
df.shape[0]

291320