In [12]:
# Martin Konečnik, https://git.siwim.si/machine-learning/fix-qa-binary-classification
# Notebook intended for prototyping binary classification script
import tomllib
from pathlib import Path

import numpy as np
import torch
from sklearn.model_selection import train_test_split
from swm import factory
from torch.utils.data import DataLoader, TensorDataset


In [13]:
# Read the configuration file.
with open('conf.toml', 'rb') as f:
    conf = tomllib.load(f)

EVENTS_PATH = Path().home() / conf['data_dir'] / 'prepared'
INDEX = conf['channel']

In [14]:
# Build a list of training data.
signals_unaltered = []
signals_corrected = []
binary_labels = []
for event in (EVENTS_PATH / 'unaltered' / '0').iterdir():
    data = factory.read_file(event)
    signals_unaltered.append(data.acqdata.a[INDEX].data)

for event in (EVENTS_PATH / 'corrected' / '0').iterdir():
    data = factory.read_file(event)
    signals_corrected.append(data.acqdata.a[INDEX].data)

signals = signals_unaltered + signals_corrected
binary_labels = np.array([0] * len(signals_unaltered) + [1] * len(signals_corrected))

max_length = max(len(signal) for signal in signals)

signals = np.array([np.pad(signal, (0, max_length - len(signal))) for signal in signals])
binary_labels = np.array(binary_labels)

print(f'Original signals shape: {signals.shape}')
print(f'Binary labels shape: {binary_labels.shape}')

Original signals shape: (61501, 0)
Binary labels shape: (61501,)


In [15]:
# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(signals, binary_labels, test_size=0.2, stratify=binary_labels)  # Stratify makes sure the random split preserves ratio of classes.

In [16]:
# # Convert to TF Dataset for better performance.
# train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
# val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))
#
# # Prepare for training.
# batch = 32
# train_dataset = train_dataset.shuffle(buffer_size=len(X_train)).batch(batch).prefetch(tf.data.AUTOTUNE)
# val_dataset = val_dataset.shuffle(buffer_size=len(X_train)).batch(batch).prefetch(tf.data.AUTOTUNE)
#
# # Sanity check.
# sample_X, sample_y = next(iter(train_dataset))
# print(f'Final training batch - X shape: {sample_X.shape}, y shape: {sample_y.shape}')
# print(f'X dtype: {sample_X.dtype}, y dtype: {sample_y.dtype}')  # Should be float32 and int.

In [18]:
# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.from_numpy(X_train).float()  # float32 for features
y_train_tensor = torch.from_numpy(y_train).long()  # int64 for labels
X_val_tensor = torch.from_numpy(X_val).float()
y_val_tensor = torch.from_numpy(y_val).long()

# Create TensorDatasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

# Prepare DataLoaders
batch_size = 32
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

# Sanity check
sample_X, sample_y = next(iter(train_loader))
print(f'Final training batch - X shape: {sample_X.shape}, y shape: {sample_y.shape}')
print(f'X dtype: {sample_X.dtype}, y dtype: {sample_y.dtype}')  # Should be float32 and int.

Final training batch - X shape: torch.Size([32, 0]), y shape: torch.Size([32])
X dtype: torch.float32, y dtype: torch.int64
