Notebook of a lot of useful functions for getting tfrecords ready for training.  
Used for:

The initial dataset shuffling

Converting tfrecord to dataset for training

Checking tfrecords for correctness (quality and enough shuffling)

Test, Validate, Train splitting

In [None]:
import time
import sys
import os
import glob
import math
import threading
import concurrent.futures as cf

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Input, Model, layers, metrics, losses, callbacks, optimizers, models, utils
from keras import backend as K
import gc
import keras_tuner as kt
from pyfaidx import Fasta

K.clear_session()
gc.collect()

datasets_path = "../../Datasets/"
models_path = "../../Models/"

2025-03-12 21:56:28.662150: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-12 21:56:28.850249: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-12 21:56:28.903784: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-12 21:56:29.276700: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
def parse_chunk_example(serialized_example):
    """
    Parses a single serialized tf.train.Example back into tensors.
    Used in testing datasets and in piping tfrecords to DL Algorithms
    """
    feature_spec = {
        'X':          tf.io.VarLenFeature(tf.float32),
        'y':          tf.io.VarLenFeature(tf.float32),
        'record_id':  tf.io.FixedLenFeature([], tf.string),
        'cstart':     tf.io.FixedLenFeature([1], tf.int64),
        'cend':       tf.io.FixedLenFeature([1], tf.int64),
        'strand':     tf.io.FixedLenFeature([], tf.string),
        'chunk_size': tf.io.FixedLenFeature([1], tf.int64),
    }
    
    parsed = tf.io.parse_single_example(serialized_example, feature_spec)
    
    # chunk_size is shape [1]
    chunk_size = parsed['chunk_size'][0]
    
    # Convert sparse to dense
    X_flat = tf.sparse.to_dense(parsed['X'])
    y_flat = tf.sparse.to_dense(parsed['y'])

    # Reshape X to [chunk_size, 5]
    X_reshaped = tf.reshape(X_flat, [chunk_size, 5])
    # Reshape y to [chunk_size], probably redundant
    y_reshaped = tf.reshape(y_flat, [chunk_size, 5])
    
    record_id = parsed['record_id']
    cstart    = parsed['cstart'][0]
    cend      = parsed['cend'][0]
    strand    = parsed['strand']
    
    return X_reshaped, y_reshaped, record_id, cstart, cend, strand

In [4]:
def build_dataset_from_tfrecords(
    tfrecord_pattern,
    batch_size=28,
    compression_type='GZIP',
    shuffle_buffer=66000,
):
    '''
    Builds shuffled dataset from tfrecords.  Returns unparsed serialized
    dataset that is not human readable.  
    '''

    # Loads in records in a round robin fashion for slightly increased mixing
    files = tf.data.Dataset.list_files(tfrecord_pattern, shuffle=True)
    dataset = files.interleave(
        lambda fname: tf.data.TFRecordDataset(fname, compression_type=compression_type),
        cycle_length=4,        # how many files to read in parallel
        block_length=1,         # how many records to read from each file before switching
        num_parallel_calls=tf.data.AUTOTUNE
)
    
    # Shuffle at the record level
    dataset = dataset.shuffle(shuffle_buffer, reshuffle_each_iteration=True)

    # Shuffle at batch level
    dataset = dataset.batch(batch_size)
    dataset = dataset.shuffle(8*batch_size, reshuffle_each_iteration=True)
    dataset = dataset.unbatch()

    # Prefetch for efficient access
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

In [None]:
options = tf.io.TFRecordOptions(compression_type="GZIP")
tfrecord_pattern = "Shuffling/Shuffle_9/shuffled_shard_*.tfrecord.gz"
ds = build_dataset_from_tfrecords(tfrecord_pattern,
                                  batch_size=32, compression_type='GZIP',
                                  shuffle_buffer=50000)

'''Commented out so I don't accidentally try to rewrite anything'''
# output_path = "Shuffling/Shuffle_10"
# if not os.path.exists(output_path):
#     os.makedirs(output_path)

# num_shards = 4
# writers = [
#     tf.io.TFRecordWriter(f"{output_path}/shuffled_shard_{i}.tfrecord.gz", options=options)
#     for i in range(num_shards)
# ]

# # Write out round-robin to each shard
# for i, serialized_example in enumerate(ds):
#     shard_index = i % num_shards
#     writers[shard_index].write(serialized_example.numpy())

# # Close all writers
# for w in writers:
#     w.close()

I0000 00:00:1739120295.628038     685 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:04:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1739120295.822252     685 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:04:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1739120295.822364     685 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:04:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1739120295.827044     685 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:04:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1739120295.827114     685 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:04:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

In [5]:
from keras import backend as K
import gc
K.clear_session()
gc.collect()

0

In [11]:
def test_dataset_from_tfrecords(
    tfrecord_pattern,
    batch_size=32,
    compression_type='GZIP',
    shuffle_buffer=75000
):
    '''
    Imports tfrecord and shuffles it then parses it and returns a
    human readable dataset.  
    Two goals: 
        1. To confirm tfrecord(s) is/are saved properly
        2. To view list of record_ids in the batch to see if dataset 
            is sufficiently shuffled.  Ideally, a good spread of chrN
            shows up.
    '''
    # Loads in records in a round robin fashion for slightly increased mixing
    files = tf.data.Dataset.list_files(tfrecord_pattern, shuffle=True)
    dataset = files.interleave(
        lambda fname: tf.data.TFRecordDataset(fname, compression_type=compression_type),
        cycle_length=4,        # how many files to read in parallel
        block_length=1,         # how many records to read from each file before switching
        num_parallel_calls=tf.data.AUTOTUNE
)
    
    # Shuffle at the record level
    dataset = dataset.shuffle(shuffle_buffer, reshuffle_each_iteration=True)

    # Shuffle at batch level
    dataset = dataset.batch(batch_size)
    dataset = dataset.shuffle(8*batch_size, reshuffle_each_iteration=True)

    # Unbatch for parsing and parse
    dataset = dataset.unbatch()    
    dataset = dataset.map(parse_chunk_example, num_parallel_calls=tf.data.AUTOTUNE)

    # Rebatch parsed and prefetch for efficient reading
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

In [12]:
tfrecord_pattern = "Shuffling/Shuffle_10/shuffled_shard_*.tfrecord.gz"

ds = test_dataset_from_tfrecords(tfrecord_pattern,
                                  batch_size=32, compression_type='GZIP',
                                  shuffle_buffer=50000)

for X_batch, y_batch, record_id_batch, cstart_batch, cend_batch, strand_batch in ds.take(1):
    print("X shape:", X_batch.shape)
    print("y shape:", y_batch.shape)
    print("record_id:", record_id_batch)
    print("cstart:", cstart_batch)
    print("cend:", cend_batch)
    print("strand:", strand_batch)
    # for i in range(5000):
    #     print(f"Data: {X_batch[0][i]},   {y_batch[0][i]} :Label")
    # print(f"chr: {record_id_batch[0]}, cstart: {cstart_batch[0]}, cend: {cend_batch[0]}")

2025-02-09 14:37:40.941363: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:25: Filling up shuffle buffer (this may take a while): 49791 of 50000
2025-02-09 14:37:40.979486: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.
2025-02-09 14:37:42.531447: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.
2025-02-09 14:37:42.657657: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


X shape: (32, 5000, 5)
y shape: (32, 5000, 5)
record_id: tf.Tensor(
[b'chr22' b'chr5' b'chr14' b'chr1' b'chr15' b'chr7' b'chr3' b'chr4'
 b'chr12' b'chr2' b'chr11' b'chr7' b'chr9' b'chr8' b'chr5' b'chr8'
 b'chr13' b'chr2' b'chr8' b'chr22' b'chr16' b'chr16' b'chr18' b'chr2'
 b'chr19' b'chr18' b'chr14' b'chr10' b'chr9' b'chr6' b'chr12' b'chr16'], shape=(32,), dtype=string)
cstart: tf.Tensor(
[ 38420000  95860000 104730000  21215000  87875000  43145000 109365000
 176305000  65165000 186245000 126335000  47305000  91100000  27445000
  36065000  29900000  57135000 136135000  28445000  30535000  70980000
  12780000  32090000 227705000  35835000  12825000 104765000 130110000
 133765000 134640000  63570000  11625000], shape=(32,), dtype=int64)
cend: tf.Tensor(
[ 38425000  95865000 104735000  21220000  87880000  43150000 109370000
 176310000  65170000 186250000 126340000  47310000  91105000  27450000
  36070000  29905000  57140000 136140000  28450000  30540000  70985000
  12785000  32095000 2277

In [None]:
print("record_id:", record_id_batch)
print("cstart:", cstart_batch)
print("cend:", cend_batch)
print("strand:", strand_batch)

record_id: tf.Tensor(
[b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1'
 b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1'
 b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1' b'chr1'
 b'chr1' b'chr1' b'chr1' b'chr1' b'chr1'], shape=(32,), dtype=string)
cstart: tf.Tensor(
[ 42760000  26005000  72895000  50140000  46100000  87160000   1520000
  78365000  13160000  29705000   7440000  25875000  91825000  99965000
  46550000  21950000  40390000  62605000  20195000  15405000 100150000
  40255000  55170000  91380000  15725000  95180000  99840000  56145000
 100465000  46890000  86785000  35730000], shape=(32,), dtype=int64)
cend: tf.Tensor(
[ 42765000  26010000  72900000  50145000  46105000  87165000   1525000
  78370000  13165000  29710000   7445000  25880000  91830000  99970000
  46555000  21955000  40395000  62610000  20200000  15410000 100155000
  40260000  55175000  91385000  15730000  95185000  99845000  56150000
 100470000  4689

In [11]:
num_batches = 0
for _ in ds:
    num_batches += 1

print("Total number of batches:", num_batches)

Total number of batches: 6529


In [6]:
import tensorflow as tf

def split_tfrecords(original_pattern, train_path, val_path, test_path, train_frac=0.8, val_frac=0.10):
    """
    Splits TFRecord files into separate train, validation, and test sets *without parsing*.
    Reads raw serialized records and writes them into new TFRecord files.
    """
    options = tf.io.TFRecordOptions(compression_type="GZIP")
    # Create TFRecord writers
    train_writer = tf.io.TFRecordWriter(train_path, options=options)
    val_writer = tf.io.TFRecordWriter(val_path, options=options)
    test_writer = tf.io.TFRecordWriter(test_path, options=options)

    # List the original TFRecord files
    dataset = tf.data.TFRecordDataset(tf.io.gfile.glob(original_pattern), compression_type='GZIP')
    
    num_records = 0
    for _ in dataset:
        num_records += 1
    print(f"Total records found: {num_records}")

    # Compute split sizes
    train_size = int(train_frac * num_records)
    val_size   = int(val_frac * num_records)
    test_size  = num_records - train_size - val_size  # Ensuring all records are accounted for

    print(f"Splitting into -> Train: {train_size}, Val: {val_size}, Test: {test_size}")

    # Iterate over records and write them to appropriate files
    train_count, val_count, test_count = 0, 0, 0
    dataset = tf.data.TFRecordDataset(tf.io.gfile.glob(original_pattern), compression_type='GZIP')
    dataset = dataset.shuffle(25000, reshuffle_each_iteration=True)

    for i, raw_record in enumerate(dataset):
        if i < train_size:
            train_writer.write(raw_record.numpy())
            train_count += 1
        elif i < train_size + val_size:
            val_writer.write(raw_record.numpy())
            val_count += 1
        else:
            test_writer.write(raw_record.numpy())
            test_count += 1

    # Close writers
    train_writer.close()
    val_writer.close()
    test_writer.close()

    print(f"Final Split Counts -> Train: {train_count}, Val: {val_count}, Test: {test_count}")

# Using function, commented out because accidentally trying to rewrite this would be annoying
# split_tfrecords(
#     original_pattern="Shuffling/Shuffle_10/shuffled_shard_*.tfrecord.gz",
#     train_path="TestValTrain/train.tfrecord.gz",
#     val_path="TestValTrain/val.tfrecord.gz",
#     test_path="TestValTrain/test.tfrecord.gz"
# )

In [7]:
directory = "AugDataSets/Redo/"
paths = os.listdir(directory)
for filename in paths:
    pattern = directory + filename
    split_tfrecords(
        original_pattern=pattern,
        train_path="TestValTrain/train_" + filename,
        val_path="TestValTrain/val_" + filename,
        test_path="TestValTrain/test_" + filename,
    )

I0000 00:00:1741838343.592610     716 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:04:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1741838343.792492     716 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:04:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1741838343.792619     716 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:04:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1741838343.797507     716 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:04:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1741838343.797613     716 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:04:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:0

Total records found: 709682
Splitting into -> Train: 567745, Val: 70968, Test: 70969


2025-03-12 22:03:02.584040: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:450] ShuffleDatasetV3:6: Filling up shuffle buffer (this may take a while): 20794 of 25000
2025-03-12 22:03:04.604399: I tensorflow/core/kernels/data/shuffle_dataset_op.cc:480] Shuffle buffer filled.
2025-03-12 22:35:52.672784: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Final Split Counts -> Train: 567745, Val: 70968, Test: 70969
Total records found: 472988
Splitting into -> Train: 378390, Val: 47298, Test: 47300
Final Split Counts -> Train: 378390, Val: 47298, Test: 47300


2025-03-12 23:00:05.699236: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


BELOW THIS POINT IS OLD DISORGANIZED AND PROBABLY NOT GOOD ANYMORE CODE MOSTLY RELATED TO MY INITIAL EXPERIMENTATION WITH LSTM NEURAL NETWORKS

In [30]:
def find_divisors(n):
    """
    Returns a list of all whole number divisors of the given integer n.
    """
    if n <= 0:
        raise ValueError("Input must be a positive integer.")
    
    divisors = []
    for i in range(1, int(n**0.5) + 1):
        if n % i == 0:
            divisors.append(i)
            if i != n // i:
                divisors.append(n // i)
    return sorted(divisors)

number = (10446+20890+177567-23)*0.10
divisors = find_divisors(number)
print(f"The divisors of {number} are: {divisors}")

The divisors of 20888.0 are: [1, 2, 4, 7, 8, 14, 28, 56, 373.0, 746.0, 1492.0, 2611.0, 2984.0, 5222.0, 10444.0, 20888.0]


The divisors of 177567 are: [1, 3, 13, 29, 39, 87, 157, 377, 471, 1131, 2041, 4553, 6123, 13659, 59189, 177567] drop 19

The divisors of 20890 are: [1, 2, 5, 10, 2089, 4178, 10445, 20890] drop 2

The divisors of 10446 are: [1, 2, 3, 6, 1741, 3482, 5223, 10446] drop 2

In [35]:
print(10446-(10446//28)*28)

2


In [50]:
from keras import backend as K
import gc
K.clear_session()
gc.collect()

0

In [28]:
import tensorflow as tf
from keras import Input, Model, layers, metrics, losses
from keras import backend as K
import gc
import numpy as np

In [29]:
class CustomNonZeroF1Score(tf.keras.metrics.Metric):
    def __init__(self, num_classes, average='weighted', name='non_zero_f1', **kwargs):
        """
        Custom F1 score metric that only considers non-zero classes.
        
        Args:
            num_classes (int): Total number of classes. Class 0 is assumed to be the "background" class.
            average (str): 'weighted' (default) to weight by support or 'macro' for a simple average.
            name (str): Name of the metric.
            **kwargs: Additional keyword arguments.
        """
        super(CustomNonZeroF1Score, self).__init__(name=name, **kwargs)
        self.num_classes = num_classes
        if average not in ['weighted', 'macro']:
            raise ValueError("average must be 'weighted' or 'macro'")
        self.average = average
        
        # Accumulate counts per class
        self.true_positives = self.add_weight(
            name='tp', shape=(num_classes,), initializer='zeros', dtype=tf.float32
        )
        self.false_positives = self.add_weight(
            name='fp', shape=(num_classes,), initializer='zeros', dtype=tf.float32
        )
        self.false_negatives = self.add_weight(
            name='fn', shape=(num_classes,), initializer='zeros', dtype=tf.float32
        )
    
    def update_state(self, y_true, y_pred, sample_weight=None):
        """
        Updates the confusion matrix statistics.
        
        Args:
            y_true: Tensor of shape (batch_size, seq_length) with integer class labels.
            y_pred: Tensor of shape (batch_size, seq_length, num_classes) with probability distributions.
            sample_weight: Optional sample weights.
        """
        # Convert predictions to class labels using argmax along the last axis.
        y_pred = tf.argmax(y_pred, axis=-1)
        
        # Flatten the batch and sequence dimensions.
        y_true = tf.reshape(y_true, [-1])
        y_pred = tf.reshape(y_pred, [-1])
        
        # Compute confusion matrix over all predictions.
        cm = tf.math.confusion_matrix(
            y_true, y_pred, num_classes=self.num_classes, dtype=tf.float32
        )
        tp = tf.linalg.diag_part(cm)
        fp = tf.reduce_sum(cm, axis=0) - tp
        fn = tf.reduce_sum(cm, axis=1) - tp
        
        # Update state variables.
        self.true_positives.assign_add(tp)
        self.false_positives.assign_add(fp)
        self.false_negatives.assign_add(fn)
    
    def result(self):
        """
        Computes the F1 score for non-zero classes.
        
        Returns:
            F1 score computed over the non-zero classes.
        """
        precision = tf.math.divide_no_nan(
            self.true_positives, self.true_positives + self.false_positives
        )
        recall = tf.math.divide_no_nan(
            self.true_positives, self.true_positives + self.false_negatives
        )
        f1 = tf.math.divide_no_nan(2 * precision * recall, precision + recall)
        
        # Exclude class 0 (the background) from the evaluation.
        f1_non_zero = f1[1:]
        support_non_zero = (self.true_positives + self.false_negatives)[1:]
        
        if self.average == 'weighted':
            # Weight F1 by the support of each class.
            weighted_f1 = tf.reduce_sum(f1_non_zero * support_non_zero) / (tf.reduce_sum(support_non_zero) + K.epsilon())
            return weighted_f1
        else:  # macro
            return tf.reduce_mean(f1_non_zero)
    
    def reset_states(self):
        """
        Resets the metric state variables.
        """
        for v in self.variables:
            v.assign(tf.zeros_like(v))

In [30]:
# def custom_f1(y_true, y_pred):
#     y_pred = tf.argmax(y_pred, axis=-1)  # Convert softmax to class indices
#     y_pred = tf.reshape(y_pred, (-1,))  # Flatten shape to (batch_size * sequence_length,)
#     y_true = tf.reshape(y_true, (-1,))  # Flatten labels

#     return metrics.F1Score(average="weighted")(y_true, y_pred)


In [None]:
### CNN Model ###
def create_cnn_model(
    input_dim = 5,
    sequence_length = 5000,
    num_classes = 5
):

    inputs = Input(shape=(sequence_length, input_dim))

    # Convolutional blocks with pooling:
    cnn = layers.Conv1D(filters=196, kernel_size=5, activation='relu', padding='same')(inputs)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)  # Output shape: (None, 2500, 196)

    cnn = layers.Conv1D(filters=228, kernel_size=5, activation='relu', padding='same')(cnn)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)  # Output shape: (None, 1250, 228)

    cnn = layers.Conv1D(filters=228, kernel_size=5, activation='relu', padding='same')(cnn)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)  # Output shape: (None, 625, 228)

    # Upsample back to the original sequence length:
    # Here, 625 * 8 = 5000. (Make sure that the pooling factors multiply to an integer factor.)
    # cnn = layers.UpSampling1D(size=8)(cnn)  # Output shape: (None, 5000, 128)
    cnn = layers.Conv1DTranspose(filters=128, kernel_size=5, strides=8, padding='same', activation='relu')(cnn)

    # Instead of flattening, use Conv1D with kernel_size=1 as dense layers:
    cnn = layers.Conv1D(128, kernel_size=1, activation='relu')(cnn)
    cnn = layers.Dropout(0.5)(cnn)
    cnn = layers.Conv1D(128, kernel_size=1, activation='relu')(cnn)

    # Final classification layer applied at every time step:
    outputs = layers.Conv1D(num_classes, kernel_size=1, activation='softmax')(cnn)

    model = Model(inputs=inputs, outputs=outputs)
    return model

cnn_model = create_cnn_model(5, 5000, 5)
cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')#, metrics=[CustomNonZeroF1Score(num_classes=5, average='weighted')])
cnn_model.summary()


### LSTM Model ###
def create_lstm_model(
    input_dim = 5,
    sequence_length = 5000,
    num_classes = 5
    ):
    
    # Define the input layer with shape
    inputs = Input(shape=(sequence_length, input_dim))

    # Add a bidirectional LSTM that returns sequences
    lstm = layers.LSTM(36, return_sequences=True)(inputs)
    lstm = layers.Dropout(0.2)(lstm)

    # Optionally add a TimeDistributed dense layer for extra processing
    lstm = layers.TimeDistributed(layers.Dense(64, activation='relu'))(lstm)
    lstm = layers.TimeDistributed(layers.Dropout(0.5))(lstm)

    # Final TimeDistributed layer to get a prediction at each time step.
    outputs = layers.TimeDistributed(layers.Dense(num_classes, activation='softmax'))(lstm)

    # Create the model
    model = Model(inputs=inputs, outputs=outputs)
    return model

lstm_model = create_lstm_model(5, 5000, 5)
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')#, metrics=[CustomNonZeroF1Score(num_classes=5, average='weighted')])
lstm_model.summary()


### Hybrid CNN -> LSTM Series Model ###
def create_cnn_to_lstm_model(
    input_dim = 5,
    sequence_length = 5000,
    num_classes = 5
):

    input_shape = (sequence_length, input_dim)  # Input shape

    # Input layer (using a variable name that doesn’t shadow built-ins)
    inputs = Input(shape=input_shape)

    # 1D-CNN block
    cnn = layers.Conv1D(filters=64, kernel_size=5, activation='relu', padding='same')(inputs)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)  # (None, 2500, 64)

    cnn = layers.Conv1D(filters=96, kernel_size=5, activation='relu', padding='same')(cnn)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)  # (None, 1250, 96)

    # Upsample back to original sequence length
    cnn = layers.UpSampling1D(size=4)(cnn)
    # Option to swap to trainable upscaler if basic upsampling is too lossy
    # cnn = layers.Conv1DTranspose(filters=128, kernel_size=5, strides=4, padding='same', activation='relu')(cnn)

    # LSTM block
    lstm = layers.LSTM(36, return_sequences=True)(cnn)
    lstm = layers.Dropout(0.2)(lstm)

    # TimeDistributed Dense processing
    lstm = layers.TimeDistributed(layers.Dense(64, activation='relu'))(lstm)
    lstm = layers.TimeDistributed(layers.Dropout(0.5))(lstm)

    # Output layer
    outputs = layers.TimeDistributed(layers.Dense(num_classes, activation='softmax'))(lstm)

    # Create and compile the model
    model = Model(inputs=inputs, outputs=outputs)
    return model

cnn_to_lstm_model = create_cnn_to_lstm_model(5, 5000, 5)
cnn_to_lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')#, metrics=[CustomNonZeroF1Score(num_classes=5, average='weighted')])
cnn_to_lstm_model.summary()


### Hybrid LSTM -> CNN Series Model ###
def create_lstm_to_cnn_model(
    input_dim = 5,
    sequence_length = 5000,
    num_classes = 5
):

    input_shape = (sequence_length, input_dim)  # Input shape  # One-hot encoded sequence

    # Input layer
    inputs = Input(shape=input_shape)

    # LSTM block
    lstm = layers.LSTM(36, return_sequences=True)(inputs)
    lstm = layers.Dropout(0.2)(lstm)

    # 1D-CNN block
    cnn = layers.Conv1D(filters=64, kernel_size=5, activation='relu', padding='same')(lstm)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)

    cnn = layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same')(cnn)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)

    # Upsample back to the original sequence length:
    cnn = layers.UpSampling1D(size=4)(cnn)
    # Option to swap to trainable upscaler if basic upsampling is too lossy
    # cnn = layers.Conv1DTranspose(filters=128, kernel_size=5, strides=4, padding='same', activation='relu')(cnn)

    # Instead of flattening, use Conv1D with kernel_size=1 as dense layers:
    cnn = layers.Conv1D(128, kernel_size=1, activation='relu')(cnn)
    cnn = layers.Dropout(0.5)(cnn)
    cnn = layers.Conv1D(128, kernel_size=1, activation='relu')(cnn)

    # Final classification layer applied at every time step:
    outputs = layers.Conv1D(num_classes, kernel_size=1, activation='softmax')(cnn)

    # Create model
    model = Model(inputs=inputs, outputs=outputs)
    return model

lstm_to_cnn_model = create_lstm_to_cnn_model(5, 5000, 5)
lstm_to_cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')#, metrics=[CustomNonZeroF1Score(num_classes=5, average='weighted')])
lstm_to_cnn_model.summary()


### Hybrid CNN | LSTM Parallel Model ###
def create_parallel_hybrid_model(
    input_dim = 5,
    sequence_length = 5000,
    num_classes = 5
):

    input_shape = (sequence_length, input_dim)

    # Input layer
    inputs = Input(shape=input_shape)
    
    # CNN Branch
    cnn = layers.Conv1D(filters=64, kernel_size=5, activation='relu', padding='same')(inputs)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)

    cnn = layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same')(cnn)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)

    # Instead of flattening, use Conv1D with kernel_size=1 as dense layers:
    cnn = layers.Conv1D(128, kernel_size=1, activation='relu')(cnn)
    cnn = layers.Dropout(0.5)(cnn)
    cnn = layers.Conv1D(128, kernel_size=1, activation='relu')(cnn)

    cnn = layers.UpSampling1D(size=4)(cnn)
    # Option to swap to trainable upscaler if basic upsampling is too lossy
    # cnn = layers.Conv1DTranspose(filters=128, kernel_size=5, strides=4, padding='same', activation='relu')(cnn)

    # LSTM Branch
    lstm = layers.LSTM(36, return_sequences=True)(inputs)
    lstm = layers.Dropout(0.2)(lstm)

    # Concatenate the branches
    combined = layers.Concatenate()([cnn, lstm])

    combined = layers.TimeDistributed(layers.Dense(64, activation='relu'))(combined)
    combined = layers.TimeDistributed(layers.Dropout(0.5))(combined)

    # Output layer
    outputs = layers.TimeDistributed(layers.Dense(num_classes, activation='softmax'))(combined)

    # Create model
    model = Model(inputs=inputs, outputs=outputs)
    return model

parallel_hybrid_model = create_parallel_hybrid_model(5, 5000, 5)
parallel_hybrid_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')#, metrics=[CustomNonZeroF1Score(num_classes=5, average='weighted')])
parallel_hybrid_model.summary()

In [51]:
### CNN Model ###
def create_cnn_model(
    input_dim = 5,
    sequence_length = 5000,
    num_classes = 5
):

    inputs = Input(shape=(sequence_length, input_dim))

    # Convolutional blocks with pooling:
    cnn = layers.Conv1D(filters=196, kernel_size=5, activation='relu', padding='same')(inputs)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)  # Output shape: (None, 2500, 196)

    cnn = layers.Conv1D(filters=228, kernel_size=5, activation='relu', padding='same')(cnn)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)  # Output shape: (None, 1250, 228)

    cnn = layers.Conv1D(filters=228, kernel_size=5, activation='relu', padding='same')(cnn)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)  # Output shape: (None, 625, 228)

    # Upsample back to the original sequence length:
    # Here, 625 * 8 = 5000. (Make sure that the pooling factors multiply to an integer factor.)
    # cnn = layers.UpSampling1D(size=8)(cnn)  # Output shape: (None, 5000, 128)
    cnn = layers.Conv1DTranspose(filters=128, kernel_size=5, strides=8, padding='same', activation='relu')(cnn)

    # Instead of flattening, use Conv1D with kernel_size=1 as dense layers:
    cnn = layers.Conv1D(128, kernel_size=1, activation='relu')(cnn)
    cnn = layers.Dropout(0.5)(cnn)
    cnn = layers.Conv1D(128, kernel_size=1, activation='relu')(cnn)

    # Final classification layer applied at every time step:
    outputs = layers.Conv1D(num_classes, kernel_size=1, activation='softmax')(cnn)

    model = Model(inputs=inputs, outputs=outputs)
    return model

cnn_model = create_cnn_model(5, 5000, 5)
cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')#, metrics=[CustomNonZeroF1Score(num_classes=5, average='weighted')])
cnn_model.summary()

In [52]:
### LSTM Model ###
def create_lstm_model(
    input_dim = 5,
    sequence_length = 5000,
    num_classes = 5
    ):
    
    # Define the input layer with shape
    inputs = Input(shape=(sequence_length, input_dim))

    # Add a bidirectional LSTM that returns sequences
    lstm = layers.LSTM(36, return_sequences=True)(inputs)
    lstm = layers.Dropout(0.2)(lstm)

    # Optionally add a TimeDistributed dense layer for extra processing
    lstm = layers.TimeDistributed(layers.Dense(64, activation='relu'))(lstm)
    lstm = layers.TimeDistributed(layers.Dropout(0.5))(lstm)

    # Final TimeDistributed layer to get a prediction at each time step.
    outputs = layers.TimeDistributed(layers.Dense(num_classes, activation='softmax'))(lstm)

    # Create the model
    model = Model(inputs=inputs, outputs=outputs)
    return model

lstm_model = create_lstm_model(5, 5000, 5)
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')#, metrics=[CustomNonZeroF1Score(num_classes=5, average='weighted')])
lstm_model.summary()

In [43]:
### Hybrid CNN -> LSTM Series Model ###
def create_cnn_to_lstm_model(
    input_dim = 5,
    sequence_length = 5000,
    num_classes = 5
):

    input_shape = (sequence_length, input_dim)  # Input shape

    # Input layer (using a variable name that doesn’t shadow built-ins)
    inputs = Input(shape=input_shape)

    # 1D-CNN block
    cnn = layers.Conv1D(filters=64, kernel_size=5, activation='relu', padding='same')(inputs)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)  # (None, 2500, 64)

    cnn = layers.Conv1D(filters=96, kernel_size=5, activation='relu', padding='same')(cnn)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)  # (None, 1250, 96)

    # Upsample back to original sequence length
    cnn = layers.UpSampling1D(size=4)(cnn)
    # Option to swap to trainable upscaler if basic upsampling is too lossy
    # cnn = layers.Conv1DTranspose(filters=128, kernel_size=5, strides=4, padding='same', activation='relu')(cnn)

    # LSTM block
    lstm = layers.LSTM(36, return_sequences=True)(cnn)
    lstm = layers.Dropout(0.2)(lstm)

    # TimeDistributed Dense processing
    lstm = layers.TimeDistributed(layers.Dense(64, activation='relu'))(lstm)
    lstm = layers.TimeDistributed(layers.Dropout(0.5))(lstm)

    # Output layer
    outputs = layers.TimeDistributed(layers.Dense(num_classes, activation='softmax'))(lstm)

    # Create and compile the model
    model = Model(inputs=inputs, outputs=outputs)
    return model

cnn_to_lstm_model = create_cnn_to_lstm_model(5, 5000, 5)
cnn_to_lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')#, metrics=[CustomNonZeroF1Score(num_classes=5, average='weighted')])
cnn_to_lstm_model.summary()

In [44]:
### Hybrid LSTM -> CNN Series Model ###
def create_lstm_to_cnn_model(
    input_dim = 5,
    sequence_length = 5000,
    num_classes = 5
):

    input_shape = (sequence_length, input_dim)  # Input shape  # One-hot encoded sequence

    # Input layer
    inputs = Input(shape=input_shape)

    # LSTM block
    lstm = layers.LSTM(36, return_sequences=True)(inputs)
    lstm = layers.Dropout(0.2)(lstm)

    # 1D-CNN block
    cnn = layers.Conv1D(filters=64, kernel_size=5, activation='relu', padding='same')(lstm)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)

    cnn = layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same')(cnn)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)

    # Upsample back to the original sequence length:
    cnn = layers.UpSampling1D(size=4)(cnn)
    # Option to swap to trainable upscaler if basic upsampling is too lossy
    # cnn = layers.Conv1DTranspose(filters=128, kernel_size=5, strides=4, padding='same', activation='relu')(cnn)

    # Instead of flattening, use Conv1D with kernel_size=1 as dense layers:
    cnn = layers.Conv1D(128, kernel_size=1, activation='relu')(cnn)
    cnn = layers.Dropout(0.5)(cnn)
    cnn = layers.Conv1D(128, kernel_size=1, activation='relu')(cnn)

    # Final classification layer applied at every time step:
    outputs = layers.Conv1D(num_classes, kernel_size=1, activation='softmax')(cnn)

    # Create model
    model = Model(inputs=inputs, outputs=outputs)
    return model

lstm_to_cnn_model = create_lstm_to_cnn_model(5, 5000, 5)
lstm_to_cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')#, metrics=[CustomNonZeroF1Score(num_classes=5, average='weighted')])
lstm_to_cnn_model.summary()

In [45]:
### Hybrid CNN | LSTM Parallel Model ###
def create_parallel_hybrid_model(
    input_dim = 5,
    sequence_length = 5000,
    num_classes = 5
):

    input_shape = (sequence_length, input_dim)

    # Input layer
    inputs = Input(shape=input_shape)
    
    # CNN Branch
    cnn = layers.Conv1D(filters=64, kernel_size=5, activation='relu', padding='same')(inputs)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)

    cnn = layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same')(cnn)
    cnn = layers.BatchNormalization()(cnn)
    cnn = layers.MaxPooling1D(pool_size=2)(cnn)

    # Instead of flattening, use Conv1D with kernel_size=1 as dense layers:
    cnn = layers.Conv1D(128, kernel_size=1, activation='relu')(cnn)
    cnn = layers.Dropout(0.5)(cnn)
    cnn = layers.Conv1D(128, kernel_size=1, activation='relu')(cnn)

    cnn = layers.UpSampling1D(size=4)(cnn)
    # Option to swap to trainable upscaler if basic upsampling is too lossy
    # cnn = layers.Conv1DTranspose(filters=128, kernel_size=5, strides=4, padding='same', activation='relu')(cnn)

    # LSTM Branch
    lstm = layers.LSTM(36, return_sequences=True)(inputs)
    lstm = layers.Dropout(0.2)(lstm)

    # Concatenate the branches
    combined = layers.Concatenate()([cnn, lstm])

    combined = layers.TimeDistributed(layers.Dense(64, activation='relu'))(combined)
    combined = layers.TimeDistributed(layers.Dropout(0.5))(combined)

    # Output layer
    outputs = layers.TimeDistributed(layers.Dense(num_classes, activation='softmax'))(combined)

    # Create model
    model = Model(inputs=inputs, outputs=outputs)
    return model

parallel_hybrid_model = create_parallel_hybrid_model(5, 5000, 5)
parallel_hybrid_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')#, metrics=[CustomNonZeroF1Score(num_classes=5, average='weighted')])
parallel_hybrid_model.summary()

In [36]:
def drop_exact_records(dataset: tf.data.Dataset, total_records, num_to_drop, seed=None):
    if seed:
        np.random.seed(seed)
    drop_indices = set(np.random.choice(total_records, num_to_drop, replace=False))
    dataset = dataset.enumerate()
    dataset = dataset.filter(lambda i, x: ~tf.reduce_any(tf.equal(i, list(drop_indices))))
    dataset = dataset.map(lambda i, x: x)
    return dataset


def parse_chunk_example(serialized_example):
    """
    Parses a single serialized tf.train.Example back into tensors.
    Used in testing datasets and in piping tfrecords to DL Algorithms
    """
    feature_spec = {
        'X':          tf.io.VarLenFeature(tf.float32),
        'y':          tf.io.VarLenFeature(tf.int64),
        'record_id':  tf.io.FixedLenFeature([], tf.string),
        'cstart':     tf.io.FixedLenFeature([1], tf.int64),
        'cend':       tf.io.FixedLenFeature([1], tf.int64),
        'strand':     tf.io.FixedLenFeature([], tf.string),
        'chunk_size': tf.io.FixedLenFeature([1], tf.int64),
    }
    
    parsed = tf.io.parse_single_example(serialized_example, feature_spec)
    
    # chunk_size is shape [1]
    chunk_size = parsed['chunk_size'][0]
    
    # Convert sparse to dense
    X_flat = tf.sparse.to_dense(parsed['X'])
    y_flat = tf.sparse.to_dense(parsed['y'])

    # Reshape X to [chunk_size, 5]
    X_reshaped = tf.reshape(X_flat, [chunk_size, 5])
    # Reshape y to [chunk_size], probably redundant
    y_reshaped = tf.reshape(y_flat, [chunk_size])
    
    record_id = parsed['record_id']
    cstart    = parsed['cstart'][0]
    cend      = parsed['cend'][0]
    strand    = parsed['strand']
    
    return X_reshaped, y_reshaped, record_id, cstart, cend, strand


def prepare_for_model(X, y, record_id, cstart, cend, strand):
    '''
    Extracts and reshapes parsed data for feeding to DL Models
    '''
    # Expand last dimension of y from (batch_size, 5000) to (batch_size, 5000, 1)
    # y = tf.expand_dims(y, axis=-1) turns out this line is not needed
    # Return only (X, y). Discard the extra columns for training knowing that 
    # they still exist in the TestValTrain originals if we need them
    return X, y


def prep_dataset_from_tfrecord(
    tfrecord_path,
    batch_size=28,
    compression_type='GZIP',
    shuffled = False,
    shuffle_buffer=25000,
    total_records=None,
    num_to_drop=None,
    seed=None
):
    '''
    Imports tfrecord and shuffles it then parses it for use in fitting a model
    '''
    # Loads in records in a round robin fashion for slightly increased mixing
    dataset = tf.data.TFRecordDataset(tfrecord_path, compression_type=compression_type, num_parallel_reads = tf.data.AUTOTUNE)
    
    if num_to_drop:
        dataset = drop_exact_records(dataset, total_records=total_records, num_to_drop=num_to_drop, seed=seed)
    
    if shuffled == True:
        # Shuffle at the record level
        dataset = dataset.shuffle(shuffle_buffer, reshuffle_each_iteration=True)
        
    
    dataset = dataset.map(parse_chunk_example, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.map(prepare_for_model, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.map(lambda x, y: (x, tf.cast(y, tf.int32))) # found out tensorflow wants int32 in y

    # Rebatch parsed and prefetch for efficient reading
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

In [37]:
# tfrecord_pattern = "TestValTrain/train.tfrecord.gz"

# ds = prep_dataset_from_tfrecord("TestValTrain/train.tfrecord.gz",
#                                 batch_size=32, 
#                                 compression_type='GZIP', 
#                                 shuffled=False,
#                                 shuffle_buffer=100000
#                                 )

# for X_batch, y_batch in ds.take(1):
#     print("X shape:", X_batch.shape)
#     print("y shape:", y_batch.shape)
# num_batches = 0
# for _ in ds:
#     num_batches += 1

# print("Total number of batches:", num_batches)

In [48]:
import time
from keras import callbacks
import sys

class TimeLimit(callbacks.Callback):
    def __init__(self, max_time_seconds):
        super().__init__()
        self.max_time_seconds = max_time_seconds
        self.start_time = None

    def on_train_begin(self, logs=None):
        self.start_time = time.time()

    # def on_batch_end(self, batch, logs=None):
    #     if time.time() - self.start_time > self.max_time_seconds:
    #         self.model.stop_training = True
    
    # def on_train_batch_end(self, batch, logs=None):  # ✅ Runs more frequently than `on_batch_end`
    #     elapsed_time = time.time() - self.start_time
    #     if elapsed_time > self.max_time_seconds:
    #         print(f"\n⏳ Time limit of {self.max_time_seconds} sec reached. Stopping training!")
    #         self.model.stop_training = True  # 🔥 Stops training mid-batch
    
    def on_train_batch_begin(self, batch, logs=None):
        elapsed_time = time.time() - self.start_time
        if elapsed_time > self.max_time_seconds:
            print(f"\n⏳ Time limit of {self.max_time_seconds} sec reached. Stopping training!")
            self.model.stop_training = True

    def on_epoch_end(self, epoch, logs=None):  # New method added
        if time.time() - self.start_time > self.max_time_seconds:
            self.model.stop_training = True
            
class DebugCallback(callbacks.Callback):
    def on_epoch_begin(self, epoch, logs=None):
        print(f"\n🚀 Starting Epoch {epoch+1}")
        sys.stdout.flush()

    def on_batch_begin(self, batch, logs=None):
        if batch % 1000 == 0:
            print(f"🔄 Processing Batch {batch}")
            sys.stdout.flush()

    def on_batch_end(self, batch, logs=None):
        if batch % 1000 == 0:
            print(f"✅ Finished Batch {batch}")
            sys.stdout.flush()

    def on_epoch_end(self, epoch, logs=None):
        print(f"\n🏁 Epoch {epoch+1} Completed!")
        sys.stdout.flush()

In [68]:
train_dataset = prep_dataset_from_tfrecord("TestValTrain/train.tfrecord.gz",
                                batch_size=28, 
                                compression_type='GZIP', 
                                shuffled=True,
                                shuffle_buffer=25000,
                                total_records=177567,
                                num_to_drop=19
                                )

num_batches = 0
for _ in train_dataset:
    num_batches += 1

print("Total number of batches:", num_batches)

# for batch in train_dataset.take(1):
#     # print("X shape:", X_batch.shape)
#     # print("y shape:", y_batch.shape)
#     print(batch)
#     print(f"✅ Sample batch loaded! Shape: {batch[0].shape}, Labels: {batch[1].shape}")

2025-02-02 19:24:57.164452: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Total number of batches: 6341


In [24]:
# Training parameters
tf.debugging.set_log_device_placement(True)
max_time_seconds = 60  # 1 hour is 3600 seconds
batch_size = 7
epochs = 100  # Set high enough to allow stopping by time
steps_per_epoch = 6341 # 746 for validate, # 373 for test
time_limit_callback = TimeLimit(max_time_seconds=max_time_seconds)

# Dropping 23 records out of 208903 records gives an 85 10 5 split 
# with filled batches of 28 across the board.  Didn't feel like 
# rewriting to disk so I pull them out as they are fed to the model.
# seed is there to allow the same n samples to be dropped every time 
# the dataset is built
train_dataset = prep_dataset_from_tfrecord("TestValTrain/train.tfrecord.gz",
                                batch_size=7, 
                                compression_type='GZIP', 
                                shuffled=True,
                                shuffle_buffer=25000,
                                total_records=177567,
                                num_to_drop=19
                                )

val_dataset = prep_dataset_from_tfrecord("TestValTrain/val.tfrecord.gz",
                                batch_size=7, 
                                compression_type='GZIP', 
                                shuffled=False,
                                shuffle_buffer=25000,
                                total_records=20890,
                                num_to_drop=2,
                                seed=42
                                )

test_dataset = prep_dataset_from_tfrecord("TestValTrain/test.tfrecord.gz",
                                batch_size=7, 
                                compression_type='GZIP', 
                                shuffled=False,
                                shuffle_buffer=25000,
                                total_records=10446,
                                num_to_drop=2,
                                seed=42
                                )

# train_dataset = train_dataset.repeat()
# val_dataset = val_dataset.repeat()

# dl_models = [cnn_model, lstm_model, cnn_to_lstm_model, lstm_to_cnn_model, parallel_hybrid_model]

# Train 1D-CNN Model
# history_cnn = cnn_model.fit(
#     train_dataset, 
#     validation_data=val_dataset,
#     # batch_size=batch_size,
#     epochs=epochs,
#     steps_per_epoch=steps_per_epoch,
#     verbose=1,
#     callbacks=[DebugCallback()]
    # callbacks=[time_limit_callback]
# )

history_lstm = lstm_model.fit(
        train_dataset, 
        validation_data=val_dataset,
        # batch_size=batch_size,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch*4,
        callbacks=[time_limit_callback, DebugCallback()]
        )

# history_cnn_lstm = cnn_to_lstm_model.fit(
#     train_dataset, 
#     validation_data=val_dataset,
#     batch_size=batch_size,
#     epochs=epochs,
#     steps_per_epoch=steps_per_epoch,
#     callbacks=[time_limit_callback]
# )

# history_lstm_cnn = lstm_to_cnn_model.fit(
#     train_dataset, 
#     validation_data=val_dataset,
#     batch_size=batch_size,
#     epochs=epochs,
#     steps_per_epoch=steps_per_epoch,
#     callbacks=[time_limit_callback]
# )

# history_parallel_hybrid = parallel_hybrid_model.fit(
#     train_dataset, 
#     validation_data=val_dataset,
#     batch_size=batch_size,
#     epochs=epochs,
#     steps_per_epoch=steps_per_epoch,
#     callbacks=[time_limit_callback]
# )


🚀 Starting Epoch 1
Epoch 1/100
🔄 Processing Batch 0


2025-02-03 00:59:05.379757: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:531] Loaded cuDNN version 8907


✅ Finished Batch 0
[1m    2/25364[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m112:28:28[0m 16s/step - loss: 1.5366
⏳ Time limit of 60 sec reached. Stopping training!
[1m    3/25364[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m117:17:11[0m 17s/step - loss: 1.5075

KeyboardInterrupt: 

In [49]:
histories = {}
# dl_models = [cnn_model, lstm_model, cnn_to_lstm_model, lstm_to_cnn_model, parallel_hybrid_model] this line gets cleared out by last two lines, then the for loop just has a whole lot of Nones to look at

# The functions for creating models are written in jupyter boxes a few above
create_models = [create_cnn_model, create_lstm_model, create_cnn_to_lstm_model, create_lstm_to_cnn_model, create_parallel_hybrid_model]
for model_name, model_creator in zip(["cnn", "lstm", "cnn_to_lstm", "lstm_to_cnn", "parallel_hybrid"], create_models):
    print(f"\n🚀 Training {model_name}...\n")
    tf.debugging.set_log_device_placement(True)
    
    max_time_seconds = 60  # 1 hour is 3600 seconds
    batch_size = 7
    epochs = 100  # Set high enough to allow stopping by time
    steps_per_epoch = 6341*4

    time_limit_callback = TimeLimit(max_time_seconds=max_time_seconds)
    print('Compiling train dataset')
    train_dataset = prep_dataset_from_tfrecord("TestValTrain/train.tfrecord.gz",
                                    batch_size=7, 
                                    compression_type='GZIP', 
                                    shuffled=True,
                                    shuffle_buffer=25000,
                                    total_records=177567,
                                    num_to_drop=19
                                    )
    print('Compiling val dataset')
    val_dataset = prep_dataset_from_tfrecord("TestValTrain/val.tfrecord.gz",
                                    batch_size=7, 
                                    compression_type='GZIP', 
                                    shuffled=False,
                                    shuffle_buffer=25000,
                                    total_records=20890,
                                    num_to_drop=2,
                                    seed=42
                                    )

    # test_dataset = prep_dataset_from_tfrecord("TestValTrain/test.tfrecord.gz",
    #                                 batch_size=7, 
    #                                 compression_type='GZIP', 
    #                                 shuffled=False,
    #                                 shuffle_buffer=25000,
    #                                 total_records=10446,
    #                                 num_to_drop=2,
    #                                 seed=42
    #                                 )
    print("Creating next model")
    model = model_creator(5, 5000, 5)
    model.compile(optimizer='adam', loss=losses.SparseCategoricalCrossentropy(ignore_class= 0), metrics=[CustomNonZeroF1Score(num_classes=5, average='weighted')])
    model.summary()
    
    print('Fitting model')
    history = model.fit(
        train_dataset, 
        validation_data=val_dataset,
        # batch_size=batch_size,
        epochs=epochs,
        steps_per_epoch=steps_per_epoch,
        callbacks=[time_limit_callback, DebugCallback()]
    )

    print('Saving model...')
    model.save(f"Test_Models/{model_name}.h5")
    print(f"📁 Model {model_name} saved!")
    
    # 🔥 Clear GPU memory
    K.clear_session()
    gc.collect()
    print("Cleaned memory")


🚀 Training cnn...

Compiling train dataset
Compiling val dataset
Creating next model


Fitting model

🚀 Starting Epoch 1
Epoch 1/100
🔄 Processing Batch 0
✅ Finished Batch 0
[1m  998/25364[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m8:32[0m 21ms/step - loss: 0.6560 - non_zero_f1: 0.0014🔄 Processing Batch 1000
✅ Finished Batch 1000
[1m 2000/25364[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m8:09[0m 21ms/step - loss: 0.5837 - non_zero_f1: 0.0014🔄 Processing Batch 2000
✅ Finished Batch 2000
[1m 2201/25364[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m8:07[0m 21ms/step - loss: 0.5755 - non_zero_f1: 0.0015
⏳ Time limit of 60 sec reached. Stopping training!
[1m 2204/25364[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m8:07[0m 21ms/step - loss: 0.5754 - non_zero_f1: 0.0015
🏁 Epoch 1 Completed!
[1m25364/25364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 2ms/step - loss: 0.4998 - non_zero_f1: 0.0015 - val_loss: 0.4429 - val_non_zero_f1: 0.0017


2025-02-03 01:49:56.120287: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_2]]
2025-02-03 01:49:56.120349: I tensorflow/core/framework/local_rendezvous.cc:423] Local rendezvous recv item cancelled. Key hash: 5541738450647673380
2025-02-03 01:49:56.120386: I tensorflow/core/framework/local_rendezvous.cc:423] Local rendezvous recv item cancelled. Key hash: 17253768411172214452


Saving model...
📁 Model cnn saved!
Cleaned memory

🚀 Training lstm...

Compiling train dataset
Compiling val dataset
Creating next model


Fitting model

🚀 Starting Epoch 1
Epoch 1/100
🔄 Processing Batch 0
✅ Finished Batch 0
[1m    4/25364[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m105:40:15[0m 15s/step - loss: 1.6515 - non_zero_f1: 5.6019e-04
⏳ Time limit of 60 sec reached. Stopping training!
[1m    5/25364[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m104:46:14[0m 15s/step - loss: 1.6458 - non_zero_f1: 6.2971e-04

KeyboardInterrupt: 

In [3]:
import numpy as np
import tensorflow as tf
from keras import Sequential, layers, models, callbacks

In [None]:
import numpy as np
import tensorflow as tf
from keras import Sequential, layers, models, callbacks

# Create a simple model.
model = Sequential([
    layers.Dense(10, activation='relu', input_shape=(20,)),
    layers.Dense(1)
])
model.compile(optimizer='adam', loss='mse')

model2 = Sequential([
    layers.Dense(10, activation='relu', input_shape=(20,)),
    layers.Dense(1)
])
model2.compile(optimizer='adam', loss='mse')

# Generate dummy data.
X = np.random.random((1000, 20))
y = np.random.random((1000, 1))

# Use the TimeLimit callback for 30 seconds.
# time_limit_callback = TimeLimit(max_time_seconds=5)
model.fit(X, y, epochs=100, batch_size=32, callbacks=[BestModelCheckpoint])
print("begin model2")
# Generate dummy data.
X = np.random.random((1000, 20))
y = np.random.random((1000, 1))

# Use the TimeLimit callback for 30 seconds.
# time_limit_callback = TimeLimit(max_time_seconds=5)
model2.fit(X, y, epochs=100, batch_size=32, callbacks=[BestModelCheckpoint])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
I0000 00:00:1738651779.874635    1500 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:04:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1738651780.083018    1500 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:04:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1738651780.083127    1500 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:04:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1738651780.088142    1500 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:04:00.0/numa_node
Your kernel may have been built without NUMA support.
I0000 00:00:1738651780.088199    1500 cuda_executor.cc:1001] could not open file to read NUMA node: /sys/bus/pci/devices/0000:04:00.0/nu

TypeError: Callback.set_model() missing 1 required positional argument: 'model'

In [None]:
import tensorflow as tf
from keras import Input
from keras import Model, layers

# ### CNN Model ###
# # Adjusted 1D-CNN Model
# inputs = Input(shape=(sequence_length, 4))

# # Add more filters and layers to match the hybrid's complexity
# cnn = layers.Conv1D(filters=96, kernel_size=5, activation='relu', padding='same')(inputs)
# cnn = layers.MaxPooling1D(pool_size=2)(cnn)

# cnn = layers.Conv1D(filters=192, kernel_size=5, activation='relu', padding='same')(cnn)
# cnn = layers.MaxPooling1D(pool_size=2)(cnn)

# cnn = layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same')(cnn)

# # Fully connected layers
# flatten = layers.Flatten()(cnn)
# dense = layers.Dense(128, activation='relu')(flatten)
# dense = layers.Dropout(0.5)(dense)

# # Output layer
# output = layers.Dense(num_classes, activation='softmax')(dense)

# cnn_model = Model(inputs=inputs, outputs=output)

# # Compile and summarize
# cnn_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['f1_score'])
# cnn_model.summary()

# ### LSTM Model ###
# # Adjusted LSTM Model
# inputs = Input(shape=(sequence_length, 4))

# # Increase LSTM hidden size to match hybrid complexity
# lstm = layers.Bidirectional(layers.LSTM(192, return_sequences=True))(inputs)  # Larger hidden size
# lstm = layers.Bidirectional(layers.LSTM(192))(lstm)

# # Fully connected layers
# dense = layers.Dense(128, activation='relu')(lstm)
# dense = layers.Dropout(0.5)(dense)

# # Output layer
# output = layers.Dense(num_classes, activation='softmax')(dense)

# lstm_model = Model(inputs=inputs, outputs=output)

# # Compile and summarize
# lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['f1_score'])
# lstm_model.summary()

# ### Hybrid CNN -> LSTM Series Model ###
# # Define input shape
# input_shape = (sequence_length, 4)  # One-hot encoded sequence

# # Input layer
# inputs = Input(shape=input_shape)

# # 1D-CNN block
# cnn = layers.Conv1D(filters=64, kernel_size=5, activation='relu', padding='same')(inputs)
# cnn = layers.MaxPooling1D(pool_size=2)(cnn)

# cnn = layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same')(cnn)
# cnn = layers.MaxPooling1D(pool_size=2)(cnn)

# # LSTM block
# lstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(cnn)
# lstm = layers.Bidirectional(layers.LSTM(128))(lstm)

# # Fully connected layers
# dense = layers.Dense(128, activation='relu')(lstm)
# dense = layers.Dropout(0.5)(dense)

# # Output layer
# output = layers.Dense(num_classes, activation='softmax')(dense)  # For multi-class classification

# # Create model
# model = Model(inputs=inputs, outputs=output)

# # Compile model
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['f1_score'])

# # Model summary
# model.summary()

# ### Hybrid LSTM -> CNN Series Model ###
# # Define input shape
# input_shape = (sequence_length, 4)  # One-hot encoded sequence

# # Input layer
# inputs = Input(shape=input_shape)

# # LSTM block
# lstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(inputs)
# lstm = layers.Bidirectional(layers.LSTM(128))(lstm)

# # 1D-CNN block
# cnn = layers.Conv1D(filters=64, kernel_size=5, activation='relu', padding='same')(lstm)
# cnn = layers.MaxPooling1D(pool_size=2)(cnn)

# cnn = layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same')(cnn)
# cnn = layers.MaxPooling1D(pool_size=2)(cnn)

# # Fully connected layers
# dense = layers.Dense(128, activation='relu')(cnn)
# dense = layers.Dropout(0.5)(dense)

# # Output layer
# output = layers.Dense(num_classes, activation='softmax')(dense)  # For multi-class classification

# # Create model
# model = Model(inputs=inputs, outputs=output)

# # Compile model
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['f1_score'])

# # Model summary
# model.summary()

# ### Hybrid CNN | LSTM Parallel Model ###
# # Define input shape
# input_shape = (sequence_length, 4)  # One-hot encoded sequence

# # Input layer
# inputs = Input(shape=input_shape)

# # CNN Branch
# cnn = layers.Conv1D(filters=64, kernel_size=5, activation='relu', padding='same')(inputs)
# cnn = layers.MaxPooling1D(pool_size=2)(cnn)
# cnn = layers.Conv1D(filters=128, kernel_size=5, activation='relu', padding='same')(cnn)
# cnn = layers.MaxPooling1D(pool_size=2)(cnn)
# cnn = layers.Flatten()(cnn)  # Flatten for concatenation

# # LSTM Branch
# lstm = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(inputs)
# lstm = layers.Bidirectional(layers.LSTM(128))(lstm)  # Return a single vector

# # Concatenate the branches
# combined = layers.Concatenate()([cnn, lstm])

# # Fully connected layers
# dense = layers.Dense(128, activation='relu')(combined)
# dense = layers.Dropout(0.5)(dense)

# # Output layer
# output = layers.Dense(num_classes, activation='softmax')(dense)  # For multi-class classification

# # Create model
# parallel_hybrid_model = Model(inputs=inputs, outputs=output)

# # Compile model
# parallel_hybrid_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['f1_score'])

# # Model summary
# parallel_hybrid_model.summary()