Removes custom "label smoothing" from datasets

In [None]:
import time
import sys
import os
import glob
import math
import threading
import concurrent.futures as cf

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Input, Model, layers, metrics, losses, callbacks, optimizers, models, utils
from keras import backend as K
import gc
import keras_tuner as kt
from pyfaidx import Fasta

# Reset Keras' global state and run a garbage-collection pass before any
# datasets or models are created in this kernel.
K.clear_session()
gc.collect()

# Relative directory paths; presumably the base folders for datasets and saved
# models (they are not referenced by the other visible cells - TODO confirm).
datasets_path = "../../Datasets/"
models_path = "../../Models/"

2025-03-07 23:48:20.287350: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-07 23:48:20.470426: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-07 23:48:20.524085: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-07 23:48:20.897344: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


0

In [None]:
def convert_labels_to_binary(x, y):
    """
    Hard-threshold the labels: entries of y equal to exactly 1 stay 1 and
    every other value becomes 0.

    x is passed through untouched and the returned labels keep y's dtype.
    Both x and y are expected to be tensors of shape (chunk_size, 5).
    Returns the pair (x, binarized y).
    """
    is_exact_one = tf.math.equal(y, 1.0)
    hard_labels = tf.cast(is_exact_one, dtype=y.dtype)
    return x, hard_labels

def _float_feature(value):
    """Wrap an iterable of floats in a tf.train.Feature backed by a FloatList."""
    float_list = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
    """Wrap an iterable of ints in a tf.train.Feature backed by an Int64List."""
    int64_list = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=int64_list)

def _bytes_feature(value):
    """
    Wrap a single str or bytes value in a tf.train.Feature backed by a
    one-element BytesList.

    A str is UTF-8 encoded first; any other value is passed through unchanged.
    """
    payload = value.encode('utf-8') if isinstance(value, str) else value
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[payload]))

def serialize_example_with_metadata(x, y, record_id, cstart, cend, strand):
    """
    Serializes a single example into a tf.train.Example byte string.

    The labels in y are first binarized with convert_labels_to_binary, so that
    only exact 1's remain 1 and all other values become 0. The metadata is
    stored alongside the flattened features and labels.

    This function calls .numpy() and therefore only works on eager tensors
    (or plain Python/NumPy values); it cannot be passed directly to
    tf.data.Dataset.map, which traces its function in graph mode.

    Args:
        x: features, expected to have shape (chunk_size, 5); its first
            dimension is stored as 'chunk_size'.
        y: labels, expected to have the same shape as x.
        record_id: str, bytes, or a scalar string tensor.
        cstart, cend: integer coordinates given as a Python/NumPy scalar, a
            scalar tensor, or a one-element tensor/array/list/tuple.
        strand: str, bytes, or a scalar string tensor.

    Returns:
        The serialized tf.train.Example as bytes.
    """
    # First, convert labels to binary (x is returned unchanged).
    x, y_binary = convert_labels_to_binary(x, y)

    # Flatten the X and y tensors into plain Python lists of floats.
    x_flat = tf.reshape(x, [-1]).numpy().tolist()
    y_flat = tf.reshape(y_binary, [-1]).numpy().tolist()

    # Determine chunk_size from the first dimension of x.
    chunk_size = int(x.shape[0])

    # Convert metadata to Python/NumPy values.
    record_id_val = record_id.numpy() if isinstance(record_id, tf.Tensor) else record_id
    strand_val = strand.numpy() if isinstance(strand, tf.Tensor) else strand
    cstart_val = cstart.numpy() if isinstance(cstart, tf.Tensor) else cstart
    cend_val = cend.numpy() if isinstance(cend, tf.Tensor) else cend

    # Take the first element after flattening. This handles scalars, 0-d
    # arrays (which cannot be indexed with [0]) and one-element sequences
    # through a single code path.
    cstart_int = int(np.asarray(cstart_val).reshape(-1)[0])
    cend_int = int(np.asarray(cend_val).reshape(-1)[0])

    # Build the feature dictionary.
    feature = {
        'X': _float_feature(x_flat),
        'y': _float_feature(y_flat),
        'record_id': _bytes_feature(record_id_val),
        'cstart': _int64_feature([cstart_int]),
        'cend': _int64_feature([cend_int]),
        'strand': _bytes_feature(strand_val),
        'chunk_size': _int64_feature([chunk_size])
    }

    # Create a tf.train.Example message and serialize it.
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

def parse_chunk_example(serialized_example):
    """
    Decodes one serialized tf.train.Example into its tensors.
    Used in testing datasets and in piping tfrecords to DL Algorithms.

    Returns the tuple (X, y, record_id, cstart, cend, strand), where X and y
    are both reshaped to [chunk_size, 5] and cstart/cend are scalars.
    """
    schema = {
        'X':          tf.io.VarLenFeature(tf.float32),
        'y':          tf.io.VarLenFeature(tf.float32),
        'record_id':  tf.io.FixedLenFeature([], tf.string),
        'cstart':     tf.io.FixedLenFeature([1], tf.int64),
        'cend':       tf.io.FixedLenFeature([1], tf.int64),
        'strand':     tf.io.FixedLenFeature([], tf.string),
        'chunk_size': tf.io.FixedLenFeature([1], tf.int64),
    }
    fields = tf.io.parse_single_example(serialized_example, schema)

    # 'chunk_size' is stored with shape [1]; take its only element.
    n_rows = fields['chunk_size'][0]
    target_shape = [n_rows, 5]

    # X and y are parsed as sparse tensors: densify, then restore [chunk_size, 5].
    features = tf.reshape(tf.sparse.to_dense(fields['X']), target_shape)
    labels = tf.reshape(tf.sparse.to_dense(fields['y']), target_shape)

    return (
        features,
        labels,
        fields['record_id'],
        fields['cstart'][0],
        fields['cend'][0],
        fields['strand'],
    )


def prep_dataset_from_tfrecord_simple(
    tfrecord_path,
    batch_size=28,
    compression_type='GZIP',
    shuffled = False,
    shuffle_buffer=25000,
    total_records=None,
    num_to_drop=None,
    seed=None
):
    '''
    Loads a TFRecord file, parses every record and binarizes its labels.

    The previous version mapped serialize_example_with_metadata over the
    dataset. That function calls .numpy(), which is unavailable while
    tf.data traces a mapped function in graph mode, so the call always failed.
    Label binarization is now done with graph-safe ops and the dataset yields
    parsed tuples that can be handed to write_tfrecord_with_metadata or
    consumed directly.

    Args:
        tfrecord_path: path (or list of paths) of the TFRecord file(s).
        compression_type: compression of the TFRecord file(s), e.g. 'GZIP'.
        batch_size, shuffled, shuffle_buffer, total_records, num_to_drop, seed:
            accepted for interface compatibility but currently ignored; no
            batching, shuffling or record dropping is performed here.

    Returns:
        A tf.data.Dataset yielding (X, y_binary, record_id, cstart, cend, strand)
        for each record.
    '''
    # Records are read in parallel (interleaved when several paths are given).
    dataset = tf.data.TFRecordDataset(
        tfrecord_path,
        compression_type=compression_type,
        num_parallel_reads=tf.data.AUTOTUNE,
    )
    dataset = dataset.map(parse_chunk_example, num_parallel_calls=tf.data.AUTOTUNE)

    def _binarize_labels(x, y, record_id, cstart, cend, strand):
        # Only y is modified; the metadata is forwarded untouched.
        x, y_binary = convert_labels_to_binary(x, y)
        return x, y_binary, record_id, cstart, cend, strand

    dataset = dataset.map(_binarize_labels, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset




def write_tfrecord_with_metadata(dataset, filename, compression_type="GZIP"):
    """
    Iterates over the dataset and writes each sample (with metadata) to a TFRecord file.

    The input dataset should yield elements in the format:
        (X, y, record_id, cstart, cend, strand)
    If the dataset is batched, it is unbatched so that each TFRecord entry
    contains a single sample. A dataset with any scalar component (for
    example the scalar record_id produced by parse_chunk_example) cannot be
    batched, so it is written as-is instead of being passed to unbatch(),
    which would raise ValueError on scalar components.

    Args:
        dataset: tf.data.Dataset yielding the 6-tuples described above.
        filename: path of the TFRecord file to create.
        compression_type: compression used for the output file (e.g. "GZIP").
    """
    # A scalar component proves the elements are single samples already.
    component_specs = tf.nest.flatten(dataset.element_spec)
    has_scalar_component = any(
        getattr(getattr(spec, "shape", None), "rank", None) == 0
        for spec in component_specs
    )
    if not has_scalar_component:
        # Unbatch so that we write one sample per TFRecord entry.
        dataset = dataset.unbatch()

    options = tf.io.TFRecordOptions(compression_type=compression_type)
    with tf.io.TFRecordWriter(filename, options=options) as writer:
        for sample in dataset:
            # sample is a tuple: (X, y, record_id, cstart, cend, strand)
            x, y, record_id, cstart, cend, strand = sample
            serialized_example = serialize_example_with_metadata(x, y, record_id, cstart, cend, strand)
            writer.write(serialized_example)

In [3]:
import tensorflow as tf
import numpy as np

def convert_labels_to_binary(x, y):
    """
    Binarize y: positions holding exactly 1 map to 1, everything else to 0.

    The result keeps y's dtype; x is returned unchanged next to it.
    Both x and y are expected to be tensors of shape (chunk_size, 5).
    """
    return x, tf.cast(tf.math.equal(y, 1.0), dtype=y.dtype)

def _float_feature(value):
    """Build a float_list tf.train.Feature from an iterable of floats."""
    wrapped = tf.train.FloatList(value=value)
    return tf.train.Feature(float_list=wrapped)

def _int64_feature(value):
    """Build an int64_list tf.train.Feature from an iterable of ints."""
    wrapped = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=wrapped)

def _bytes_feature(value):
    """
    Build a bytes_list tf.train.Feature holding exactly one value.

    str input is encoded as UTF-8; anything else is stored as given.
    """
    raw = value
    if isinstance(raw, str):
        raw = raw.encode('utf-8')
    bytes_list = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=bytes_list)

def serialize_example_with_metadata_no_convert(x, y, record_id, cstart, cend, strand):
    """
    Serializes a single example into a tf.train.Example byte string.

    Expects that y is already binary; the labels are stored as given.
    x and y are expected to be tensors of shape (chunk_size, 5). All metadata
    is preserved.

    This function calls .numpy() and therefore only works on eager tensors
    (or plain Python/NumPy values), e.g. while iterating over a dataset in
    Python; it cannot be used inside tf.data.Dataset.map.

    Args:
        x: features; its first dimension is stored as 'chunk_size'.
        y: labels, written unchanged.
        record_id: str, bytes, or a scalar string tensor.
        cstart, cend: integer coordinates given as a Python/NumPy scalar, a
            scalar tensor, or a one-element tensor/array/list/tuple.
        strand: str, bytes, or a scalar string tensor.

    Returns:
        The serialized tf.train.Example as bytes.
    """
    # Flatten the X and y tensors into plain Python lists of floats.
    x_flat = tf.reshape(x, [-1]).numpy().tolist()
    y_flat = tf.reshape(y, [-1]).numpy().tolist()

    # Determine chunk_size from the first dimension of x.
    chunk_size = int(x.shape[0])

    # Convert metadata to Python/NumPy values.
    record_id_val = record_id.numpy() if isinstance(record_id, tf.Tensor) else record_id
    strand_val = strand.numpy() if isinstance(strand, tf.Tensor) else strand
    cstart_val = cstart.numpy() if isinstance(cstart, tf.Tensor) else cstart
    cend_val = cend.numpy() if isinstance(cend, tf.Tensor) else cend

    # Take the first element after flattening. This handles scalars, 0-d
    # arrays (which cannot be indexed with [0]) and one-element sequences
    # through a single code path.
    cstart_int = int(np.asarray(cstart_val).reshape(-1)[0])
    cend_int = int(np.asarray(cend_val).reshape(-1)[0])

    feature = {
        'X': _float_feature(x_flat),
        'y': _float_feature(y_flat),
        'record_id': _bytes_feature(record_id_val),
        'cstart': _int64_feature([cstart_int]),
        'cend': _int64_feature([cend_int]),
        'strand': _bytes_feature(strand_val),
        'chunk_size': _int64_feature([chunk_size])
    }

    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()

def parse_chunk_example(serialized_example):
    """
    Turns one serialized tf.train.Example back into tensors.

    Expected record layout:
      - 'X', 'y': VarLenFeature(tf.float32), flattened [chunk_size, 5] data
      - 'record_id', 'strand': FixedLenFeature([], tf.string)
      - 'cstart', 'cend', 'chunk_size': FixedLenFeature([1], tf.int64)

    Returns (X, y, record_id, cstart, cend, strand) with X and y reshaped to
    [chunk_size, 5] and cstart/cend reduced to scalars.
    """
    one_int = [1]
    feature_spec = {
        'X':          tf.io.VarLenFeature(tf.float32),
        'y':          tf.io.VarLenFeature(tf.float32),
        'record_id':  tf.io.FixedLenFeature([], tf.string),
        'cstart':     tf.io.FixedLenFeature(one_int, tf.int64),
        'cend':       tf.io.FixedLenFeature(one_int, tf.int64),
        'strand':     tf.io.FixedLenFeature([], tf.string),
        'chunk_size': tf.io.FixedLenFeature(one_int, tf.int64),
    }
    record = tf.io.parse_single_example(serialized_example, feature_spec)

    # Number of rows in this chunk (the single element of 'chunk_size').
    rows = record['chunk_size'][0]

    # Densify the variable-length features and restore the 2-D layout.
    dense_x = tf.sparse.to_dense(record['X'])
    dense_y = tf.sparse.to_dense(record['y'])
    x_2d = tf.reshape(dense_x, [rows, 5])
    y_2d = tf.reshape(dense_y, [rows, 5])

    start = record['cstart'][0]
    end = record['cend'][0]
    return x_2d, y_2d, record['record_id'], start, end, record['strand']

def convert_and_write_tfrecord(input_tfrecord, output_tfrecord, compression_type="GZIP"):
    """
    Rewrites a TFRecord file so that its (smoothed) labels become binary.

    Every record of the input is parsed, its labels are passed through
    convert_labels_to_binary, and the result is serialized to the output file
    with the metadata fields carried over unchanged.

    Args:
      input_tfrecord: Path to the original TFRecord file.
      output_tfrecord: Path where the new TFRecord (with binary labels) will be saved.
      compression_type: Compression type used for both reading and writing (e.g., "GZIP").
    """
    def _binarize(x, y, record_id, cstart, cend, strand):
        # Replace y with its binarized version; forward everything else.
        features, hard_labels = convert_labels_to_binary(x, y)
        return features, hard_labels, record_id, cstart, cend, strand

    # Read -> parse -> binarize, all inside the tf.data pipeline.
    raw_records = tf.data.TFRecordDataset(
        input_tfrecord,
        compression_type=compression_type,
        num_parallel_reads=tf.data.AUTOTUNE,
    )
    parsed = raw_records.map(parse_chunk_example, num_parallel_calls=tf.data.AUTOTUNE)
    binarized = parsed.map(_binarize, num_parallel_calls=tf.data.AUTOTUNE)

    # Serialization needs eager tensors, so it happens in a Python loop.
    writer_options = tf.io.TFRecordOptions(compression_type=compression_type)
    with tf.io.TFRecordWriter(output_tfrecord, options=writer_options) as writer:
        for fields in binarized:
            # fields is (X, y_binary, record_id, cstart, cend, strand)
            writer.write(serialize_example_with_metadata_no_convert(*fields))

# Example usage:
# Convert and write new TFRecord files for train, validation, and test splits.
# convert_and_write_tfrecord("TestValTrain/train.tfrecord.gz", "TestValTrain/train_binary.tfrecord.gz")
# convert_and_write_tfrecord("TestValTrain/val.tfrecord.gz", "TestValTrain/val_binary.tfrecord.gz")
# convert_and_write_tfrecord("TestValTrain/test.tfrecord.gz", "TestValTrain/test_binary.tfrecord.gz")


In [9]:
batch_size = 28
epochs = 400  # Set high enough to allow stopping by callback
steps_per_epoch = 7178

print('Compiling train dataset')
train_dataset_path = "All_augmented_shuffled.tfrecord.gz"
dataset = tf.data.TFRecordDataset(train_dataset_path, compression_type='GZIP', num_parallel_reads = tf.data.AUTOTUNE)
dataset = dataset.map(parse_chunk_example, num_parallel_calls=tf.data.AUTOTUNE)

# Upper bound on the number of (X, y) rows printed for the inspected record.
max_rows_to_print = 5000

# The dataset is not batched here: each element is a single parsed chunk, so
# X_batch / y_batch hold the rows of one chunk rather than a batch of chunks.
for X_batch, y_batch, record_id, cstart, cend, strand in dataset.take(1):
    print("record_id:", record_id)
    print("cstart:", cstart)
    print("cend:", cend)
    print("strand:", strand)
    # Clamp to the real chunk length so a chunk shorter than the cap does not
    # index past the end of the tensor.
    rows_to_print = min(max_rows_to_print, int(X_batch.shape[0]))
    for i in range(rows_to_print):
        print(X_batch[i], y_batch[i])


Compiling train dataset
record_id: tf.Tensor(b'chr4', shape=(), dtype=string)
cstart: tf.Tensor(57203000, shape=(), dtype=int64)
cend: tf.Tensor(57208000, shape=(), dtype=int64)
strand: tf.Tensor(b'+', shape=(), dtype=string)
tf.Tensor([0. 0. 1. 0. 1.], shape=(5,), dtype=float32) tf.Tensor([1. 0. 0. 0. 0.], shape=(5,), dtype=float32)
tf.Tensor([0. 0. 1. 0. 1.], shape=(5,), dtype=float32) tf.Tensor([1. 0. 0. 0. 0.], shape=(5,), dtype=float32)
tf.Tensor([0. 1. 0. 0. 1.], shape=(5,), dtype=float32) tf.Tensor([1. 0. 0. 0. 0.], shape=(5,), dtype=float32)
tf.Tensor([0. 0. 0. 1. 1.], shape=(5,), dtype=float32) tf.Tensor([1. 0. 0. 0. 0.], shape=(5,), dtype=float32)
tf.Tensor([0. 0. 1. 0. 1.], shape=(5,), dtype=float32) tf.Tensor([1. 0. 0. 0. 0.], shape=(5,), dtype=float32)
tf.Tensor([0. 0. 1. 0. 1.], shape=(5,), dtype=float32) tf.Tensor([1. 0. 0. 0. 0.], shape=(5,), dtype=float32)
tf.Tensor([0. 0. 0. 1. 1.], shape=(5,), dtype=float32) tf.Tensor([1. 0. 0. 0. 0.], shape=(5,), dtype=float32)
tf.T