# Creating a Databunch for Basecalling

In [1]:
# Reload edited modules automatically (autoreload mode 2) so changes to the
# project helpers are picked up live, and render matplotlib figures inline.
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from fastai.basics import *

import jkbc.utils.preprocessing as prep
import jkbc.utils.files as f

In [3]:
# RANGE_FROM / RANGE_TO bound the indices read from the collection via
# range(RANGE_FROM, RANGE_TO), i.e. RANGE_TO itself is excluded.
RANGE_FROM     = 0
RANGE_TO       = 100
FIX_LABEL_LEN  = 60  # Needed to avoid issues with jagged arrays; passed as fixed_label_len when padding labels
BLANK_ID       = 4  # passed as padding_id when padding labels
FOLDERPATH     = 'feather-files/'
# Output location; the name encodes the settings above so different runs do not collide.
FOLDERNAME     = f'{FOLDERPATH}Range{RANGE_FROM}-{RANGE_TO}-FixLabelLen{FIX_LABEL_LEN}'

## Load Data

In [4]:
# Directory holding the data set. NOTE: absolute, machine-specific path --
# adjust it when running the notebook elsewhere.
base_dir = "/mnt/sdb/taiyaki_mapped/"
path_data = Path(base_dir)

# Open the HDF5 data set as a SignalCollection; entries are later fetched by index.
data_set_name = 'small_umi16to9.hdf5'
collection = prep.SignalCollection(path_data/data_set_name)

## Preprocess Data

In [5]:
def get_range(collection: prep.SignalCollection, ran: range)-> Tuple[np.ndarray, np.ndarray]:
    """Gather the signals and padded labels for the entries indexed by `ran`.

    Args:
        collection: indexable signal collection; every item must expose
            `.x` and `.y`.
        ran: indices of the entries to fetch from `collection`.

    Returns:
        A tuple `(x, y_padded)`: `x` is the concatenation of every entry's
        `x`, and `y_padded` is the concatenation of every entry's `y` after
        `prep.add_label_padding` has been applied with the notebook-level
        FIX_LABEL_LEN and BLANK_ID settings.

    Raises:
        ValueError: if `ran` yields no indices, because there is nothing to
            concatenate or pad.
    """
    xs, ys = [], []
    for i in ran:
        data = collection[i]
        # The full reference (data.reference) is not used while training,
        # so only the signal windows and their labels are collected.
        xs.append(np.array(data.x))
        ys.append(np.array(data.y))

    if not xs:
        raise ValueError(f"get_range received an empty index range: {ran!r}")

    # Concatenate once at the end: concatenating inside the loop re-copies
    # everything accumulated so far on every iteration (quadratic cost).
    # A single entry is returned as-is, as before.
    x = xs[0] if len(xs) == 1 else np.concatenate(xs)
    y = ys[0] if len(ys) == 1 else np.concatenate(ys)

    # Pad every label to a fixed length so the labels form a regular array.
    y_padded = prep.add_label_padding(labels = y, fixed_label_len = FIX_LABEL_LEN, padding_id = BLANK_ID)

    return (x, y_padded)

In [6]:
# Fetch the configured slice of the collection, then unpack the result into
# the signals (x) and the padded labels (y). `data` is kept for writing below.
read_indices = range(RANGE_FROM, RANGE_TO)
data = get_range(collection, read_indices)
x, y = data

Processing 0000b441-4cc6-42a7-bdfe-310f9e560e57 (0)
Processing 00016f22-28b9-4c1a-93ac-da73e66bab27 (1)
Processing 0001e785-de10-4ec3-80d7-310d6ce0af3e (2)
Processing 0002e2a3-d3b5-417b-8df6-2bb246c246e2 (3)
Processing 000367d6-8c24-4a35-8f26-9af6f3fdc513 (4)
Processing 0003854d-8576-4199-b7f9-5effedad36a8 (5)
Processing 0003ae19-5d14-4935-960b-747157bc58bc (6)
Processing 0003bddb-4bdf-4888-a31a-a131a846ff65 (7)
Processing 0004d0df-be8d-4302-a0f4-63334422e561 (8)
Processing 00054cec-632d-4754-b610-afe3743b51f8 (9)
Processing 00055b21-193e-4c2c-953a-74cf189f248f (10)
Processing 0005808e-0f97-4abe-ab57-68553255f980 (11)
Processing 0005ff17-fcb5-443a-9a0f-5a854e3c1f30 (12)
Processing 0006f86c-9a4a-47b0-b6c0-390c790fe0c4 (13)
Processing 0007dd8d-14bc-4385-a541-9d799075688e (14)
Processing 0009d90b-63d9-4688-9c29-822eb34481be (15)
Processing 000a9c32-a6f8-48d2-833b-025123d28d09 (16)
Processing 000aa87c-7bce-4fe4-9c00-6389a12904c1 (17)
Processing 000ba577-f903-4f26-8104-143cfce0fc52 (18)
Pro

In [7]:
# Persist the (x, y) tuple returned by get_range under FOLDERNAME.
f.write_data_to_feather_file(FOLDERNAME, data)

## TEST
Run this section after making changes to verify the round trip `data > write > read == data`, i.e. that reading back what was written reproduces the original data.

In [8]:
# Read the data back from the location it was just written to; the trailing
# underscore marks the reloaded copies that are compared against x and y.
x_, y_ = f.read_data_from_feather_file(FOLDERNAME)

In [9]:
# The reloaded arrays must match the originals in shape first, then in dtype.
round_trip_pairs = ((x_, x), (y_, y))
assert all(loaded.shape == original.shape for loaded, original in round_trip_pairs)
assert all(loaded.dtype == original.dtype for loaded, original in round_trip_pairs)

In [10]:
def equal_sum(a, b):
    """Assert that two nested collections have the same grand total.

    Each argument is iterated row by row; the per-row sums are added up and
    the two totals are compared for exact equality. Raises AssertionError
    when the totals differ and returns None otherwise.
    """
    total_a = sum(sum(row) for row in a)
    total_b = sum(sum(row) for row in b)
    assert total_a == total_b

In [11]:
# Compare the totals of the original and the reloaded arrays; each call
# raises AssertionError on a mismatch and produces no output otherwise.
equal_sum(x, x_)
equal_sum(y, y_)