In [1]:
# Martin Konečnik, http://git.siwim.si/machine-learning/fix-qa-binary-classification
# Notebook intended for prototyping binary classification script
import argparse
import struct
import sys
import time
import tomllib
from collections import Counter
from logging import DEBUG, INFO, WARNING, getLogger
from pathlib import Path
from typing import Dict

import numpy as np
import tensorflow as tf
import torch
from cestel_helpers.console import configure_all
from cestel_helpers.log import init_logger
from cestel_helpers.version import get_version
from lxml import etree
from sklearn.model_selection import train_test_split
from swm import factory
from torch.utils.data import DataLoader, TensorDataset


2025-05-07 13:43:11.264513: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-07 13:43:11.401223: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746618191.474989   28230 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746618191.495156   28230 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746618191.633065   28230 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [11]:
# Read the configuration file.
with open('conf.toml', 'rb') as f:
    conf = tomllib.load(f)

UNALTERED_PATH = Path(conf['unaltered'])
CORRECTED_PATH = Path(conf['corrected'])
INDEX = conf['channel']

In [12]:
# Get dicts of original and altered events with their size.
is_changed: Dict[str, bool] = {}
for unaltered_file in UNALTERED_PATH.glob('*.event'):
    corrected_file = (CORRECTED_PATH / unaltered_file.name).exists()
    if corrected_file.exists():
        if unaltered_file.read_bytes() == corrected_file.read_bytes():
            is_changed[unaltered_file.stem] = False
        else:
            is_changed[unaltered_file.stem] = True
    else:
        print(f'{unaltered_file.name} missing!')

print(f'Unaltered: {Counter(is_changed.values())[True]}')
print(f'Corrected: {Counter(is_changed.values())[False]}')

Unaltered: 0
Corrected: 28822


In [6]:
# Build a list of training data.
signals = []
binary_labels = []
for ets, status in is_changed.items():
    path = UNALTERED_PATH / f'{ets}.event'
    try:
        data = factory.read_file(path)
        signals.append(data.acqdata.a[16].data)
        binary_labels.append(0 if status else 1)
    except struct.error:
        pass

max_length = max(len(signal) for signal in signals)

signals = np.array([np.pad(signal, (0, max_length - len(signal)), 'constant') for signal in signals])
binary_labels = np.array(binary_labels)

print(f'Original signals shape: {signals.shape}')
print(f'Binary labels shape: {binary_labels.shape}')

Original signals shape: (37882, 4959)
Binary labels shape: (37882,)


In [7]:
# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(signals, binary_labels, test_size=0.2, stratify=binary_labels)  # Stratify makes sure the random split preserves ratio of classes.

In [8]:
# Convert to TF Dataset for better performance.
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train))
val_dataset = tf.data.Dataset.from_tensor_slices((X_val, y_val))

# Prepare for training.
batch = 32
train_dataset = train_dataset.shuffle(buffer_size=len(X_train)).batch(batch).prefetch(tf.data.AUTOTUNE)
val_dataset = val_dataset.shuffle(buffer_size=len(X_train)).batch(batch).prefetch(tf.data.AUTOTUNE)

# Sanity check.
sample_X, sample_y = next(iter(train_dataset))
print(f'Final training batch - X shape: {sample_X.shape}, y shape: {sample_y.shape}')
print(f'X dtype: {sample_X.dtype}, y dtype: {sample_y.dtype}')  # Should be float32 and int.

I0000 00:00:1746537976.250255    4196 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 22348 MB memory:  -> device: 0, name: NVIDIA RTX 4500 Ada Generation, pci bus id: 0000:01:00.0, compute capability: 8.9


Final training batch - X shape: (32, 4959), y shape: (32,)
X dtype: <dtype: 'float32'>, y dtype: <dtype: 'int64'>


In [9]:
# Convert numpy arrays to PyTorch tensors
X_train_tensor = torch.from_numpy(X_train).float()  # float32 for features
y_train_tensor = torch.from_numpy(y_train).long()  # int64 for labels
X_val_tensor = torch.from_numpy(X_val).float()
y_val_tensor = torch.from_numpy(y_val).long()

# Create TensorDatasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

# Prepare DataLoaders
batch_size = 32
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=2,
    pin_memory=True
)
val_loader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=2,
    pin_memory=True
)

# Sanity check
sample_X, sample_y = next(iter(train_loader))
print(f'Final training batch - X shape: {sample_X.shape}, y shape: {sample_y.shape}')
print(f'X dtype: {sample_X.dtype}, y dtype: {sample_y.dtype}')  # Should be float32 and int.

Final training batch - X shape: torch.Size([32, 4959]), y shape: torch.Size([32])
X dtype: torch.float32, y dtype: torch.int64
