This notebook builds shuffled TFRecords from the window-shifted TFRecords generated in StreamExonIntron.
I needed a better way to shuffle rows than TensorFlow's shuffle buffers, which have a memory leak in the version
I'm stuck using due to hardware compatibility.

In [None]:
import time
import sys
import os
import glob
import math
import threading
import concurrent.futures as cf
import random
import re

import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Input, Model, layers, metrics, losses, callbacks, optimizers, models, utils
from keras import backend as K
import gc
import keras_tuner as kt
from pyfaidx import Fasta

# Start from a clean state: clear the Keras backend session and run a garbage-collection pass.
K.clear_session()
gc.collect()

# Base directories, relative to the notebook's working directory. Later cells build paths
# by string concatenation (datasets_path + "..."), so the trailing slash must be kept.
datasets_path = "../../Datasets/"
models_path = "../../Models/"

2025-03-12 01:12:30.865646: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-03-12 01:12:31.052955: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-03-12 01:12:31.106170: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-03-12 01:12:31.475723: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
'''
This cell splits tfrecords made in StreamExonIntron into 24 smaller tfrecords.
The goal is to randomize the order of the smaller tfrecords when grabbing a row
to produce highly shuffled data
'''

# Define directories.
input_directory = datasets_path + "Augmented TFRecords/"
output_directory = datasets_path + "Shuffle Shards/"
save_directory = datasets_path + "AugDataSets/"
# NOTE(review): input_directory/output_directory already start with datasets_path, so the
# joins below produce "<datasets_path>AugDataSets/<datasets_path>...". With the relative
# "../../Datasets/" base this appears to resolve back to <datasets_path>Augmented TFRecords/
# and <datasets_path>Shuffle Shards/ (which is where later cells read from) -- confirm
# before changing.
input_path = os.path.join(save_directory, input_directory)
output_path = os.path.join(save_directory, output_directory)
os.makedirs(output_path, exist_ok=True)

# Number of tiny shards written per input file.
num_splits = 24
# Marker separating the set index from the shard number in an input filename.
shard_marker = "_inex_shard-"

# Only keep regular files that follow the "<set>_inex_shard-<nnnn>..." naming scheme, so
# stray entries (checkpoint folders, notes, ...) cannot break the filename parsing below.
# Sorting makes the processing order stable between runs.
input_file_names = sorted(
    name for name in os.listdir(input_path)
    if shard_marker in name and os.path.isfile(os.path.join(input_path, name))
)
input_file_paths = [os.path.join(input_path, file) for file in input_file_names]

gzip_options = tf.io.TFRecordOptions(compression_type="GZIP")

# Process each input file.
for file in input_file_paths:
    # Example filename: "1000_inex_shard-0002.tfrecord.gz"
    basename = os.path.basename(file)
    # Everything before the marker is the set index; what follows starts with the shard number.
    set_index, _marker, remainder = basename.partition(shard_marker)
    # Get the first 4 digits from remainder (ignoring the extension)
    sub_index = remainder[:4]
    # Use the final digit of the sub-index.
    # NOTE(review): only the last digit is kept, so shards 0002 and 0012 of the same set
    # would map to the same output names -- fine for <= 10 shards per set; verify otherwise.
    final_digit = sub_index[-1]

    # === First pass: Count records without loading them all ===
    total_records = sum(1 for _ in tf.data.TFRecordDataset(file, compression_type="GZIP"))

    # Compute even splits: base chunk size and distribute any remainder
    # over the first `remainder_count` shards.
    chunk_size = total_records // num_splits
    remainder_count = total_records % num_splits

    # boundaries[i] is the (exclusive) index of the last record belonging to shard i.
    boundaries = []
    end = 0
    for i in range(num_splits):
        end += chunk_size + (1 if i < remainder_count else 0)
        boundaries.append(end)

    writers = []
    try:
        # Open all 24 TFRecord writers.
        for i in range(num_splits):
            new_filename = f"{set_index}_{final_digit}_{i:02d}_tiny_inex_shard.tfrecord.gz"
            new_filepath = os.path.join(output_path, new_filename)
            writers.append(tf.io.TFRecordWriter(new_filepath, options=gzip_options))

        # === Second pass: Write records to the appropriate shard in a streaming fashion ===
        current_shard = 0
        for current_index, record in enumerate(
                tf.data.TFRecordDataset(file, compression_type="GZIP")):
            # Advance to the next shard once the current one has received its share.
            # The upper bound keeps any surplus record in the last shard instead of
            # indexing past the end of `writers`.
            while current_shard < num_splits - 1 and current_index >= boundaries[current_shard]:
                current_shard += 1
            writers[current_shard].write(record.numpy())
    finally:
        # Close every writer that was opened, even if reading or writing failed part-way.
        for writer in writers:
            writer.close()

    print(f"Processed {basename}: {total_records} records split into {num_splits} tiny shards.")

2025-03-12 01:17:11.738184: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Processed 1667_inex_shard-0003.tfrecord.gz: 59032 records split into 24 tiny shards.


2025-03-12 01:20:08.508017: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Processed 1667_inex_shard-0000.tfrecord.gz: 59117 records split into 24 tiny shards.
Processed 1667_inex_shard-0001.tfrecord.gz: 58965 records split into 24 tiny shards.


2025-03-12 01:26:06.644419: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


Processed 1667_inex_shard-0002.tfrecord.gz: 59359 records split into 24 tiny shards.


In [None]:
def split_tfrecords(input_directory: str, output_directory: str, num_splits: int = 24) -> None:
    """
    Split every gzipped TFRecord shard in input_directory into 'num_splits' smaller
    gzipped TFRecords ("tiny shards"). The goal is to randomize the order of the
    smaller TFRecords when grabbing a row to produce highly shuffled data.

    Input files are expected to be named "<set>_inex_shard-<nnnn>..." (for example
    "1000_inex_shard-0002.tfrecord.gz"); directory entries that do not contain the
    "_inex_shard-" marker, or that are not regular files, are skipped. Each output file
    is named "<set>_<d>_<ii>_tiny_inex_shard.tfrecord.gz", where <d> is the last of
    the four shard digits and <ii> is the zero-padded split number.

    Records keep their original order: the first split receives the first block of
    records, the second split the next block, and so on. When the record count is not
    divisible by num_splits, the leading splits receive one extra record each.

    Parameters:
      input_directory (str): Directory holding the input TFRecords. A relative path is
        resolved against datasets_path + "AugDataSets/"; an absolute path is used as is.
      output_directory (str): Directory for the smaller shards, resolved the same way.
        It is created if missing.
      num_splits (int): Number of splits (shards) to create per input file.

    Raises:
      ValueError: If num_splits is smaller than 1.
    """
    if num_splits < 1:
        raise ValueError(f"num_splits must be a positive integer, got {num_splits!r}.")

    shard_marker = "_inex_shard-"

    save_directory = datasets_path + "AugDataSets/"
    input_path = os.path.join(save_directory, input_directory)
    output_path = os.path.join(save_directory, output_directory)
    os.makedirs(output_path, exist_ok=True)

    # Keep only regular files that follow the naming scheme so stray directory entries
    # cannot break the filename parsing; sort for a stable processing order.
    input_file_paths = [
        os.path.join(input_path, name)
        for name in sorted(os.listdir(input_path))
        if shard_marker in name and os.path.isfile(os.path.join(input_path, name))
    ]

    gzip_options = tf.io.TFRecordOptions(compression_type="GZIP")

    for file in input_file_paths:
        # Example filename: "1000_inex_shard-0002.tfrecord.gz"
        basename = os.path.basename(file)
        set_index, _, remainder = basename.partition(shard_marker)
        # First 4 characters after the marker are the shard number; keep its last digit.
        # NOTE(review): shards 0002 and 0012 of one set would collide on output names --
        # fine for <= 10 shards per set; verify otherwise.
        sub_index = remainder[:4]
        final_digit = sub_index[-1]

        # First pass: count records without loading them all.
        total_records = sum(1 for _ in tf.data.TFRecordDataset(file, compression_type="GZIP"))

        # Even split: base chunk size, with the remainder spread over the leading shards.
        chunk_size = total_records // num_splits
        remainder_count = total_records % num_splits

        # boundaries[i] is the exclusive end index of shard i.
        boundaries = []
        end = 0
        for i in range(num_splits):
            end += chunk_size + (1 if i < remainder_count else 0)
            boundaries.append(end)

        writers = []
        try:
            # Open all the TFRecord writers.
            for i in range(num_splits):
                new_filename = f"{set_index}_{final_digit}_{i:02d}_tiny_inex_shard.tfrecord.gz"
                new_filepath = os.path.join(output_path, new_filename)
                writers.append(tf.io.TFRecordWriter(new_filepath, options=gzip_options))

            # Second pass: stream each record into the shard that owns its index.
            current_shard = 0
            for current_index, record in enumerate(
                    tf.data.TFRecordDataset(file, compression_type="GZIP")):
                # The upper bound keeps any surplus record in the last shard instead of
                # indexing past the end of `writers`.
                while (current_shard < num_splits - 1
                       and current_index >= boundaries[current_shard]):
                    current_shard += 1
                writers[current_shard].write(record.numpy())
        finally:
            # Close every writer that was opened, even if reading or writing failed.
            for writer in writers:
                writer.close()

        print(f"Processed {basename}: {total_records} records split into {num_splits} tiny shards.")

In [None]:
# input_directory = datasets_path + "Augmented TFRecords/"
# output_directory = datasets_path + "Shuffle Shards/"
# split_tfrecords(input_directory, output_directory, 24)

In [None]:
def stream_shuffled_records(input_dir, allowed_indices, seed=None):
    """
    Lazily interleave the records of the TFRecord files in input_dir whose filenames
    start with one of the allowed_indices.

    The generator proceeds in rounds: each round it shuffles the list of still-open
    files and yields one record per file. A file that has no record left is reported
    with a message and dropped from all later rounds. All matching files are opened
    up front, so one dataset iterator per file is alive at the same time.

    Args:
        input_dir (str): Directory containing the gzip-compressed TFRecord files.
        allowed_indices (list): Allowed filename prefixes, as strings or integers.
            Integers are zero-padded to four digits (0 -> "0000") so that they match
            the "<set>_..." shard naming exactly instead of matching any filename
            that merely starts with the same leading digit.
        seed (int, optional): Seed for a private random generator, making the
            interleaving order reproducible. When None (default) the global `random`
            module is used, as before.

    Yields:
        A TFRecord (as a tf.Tensor) from one of the files.

    Raises:
        ValueError: If no filename matches the allowed indices. Because this is a
            generator, the error is raised when the first record is requested.
    """
    prefixes = tuple(
        f"{idx:04d}" if isinstance(idx, int) else str(idx)
        for idx in allowed_indices
    )

    # Sorted listing gives a stable starting order, which a fixed seed then shuffles
    # identically on every run.
    file_paths = [os.path.join(input_dir, fname)
                  for fname in sorted(os.listdir(input_dir))
                  if fname.startswith(prefixes)]

    if not file_paths:
        raise ValueError("No TFRecord files found matching allowed indices.")

    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    # Create a list of (file_path, iterator) tuples.
    file_iterators = [(fp, iter(tf.data.TFRecordDataset(fp, compression_type="GZIP")))
                      for fp in file_paths]

    # Continue until all iterators are exhausted.
    while file_iterators:
        shuffle(file_iterators)
        still_open = []
        for fp, iterator in file_iterators:
            # Guard only the read itself, so an exception coming from the consumer at
            # the yield below can never be mistaken for an exhausted file.
            try:
                record = next(iterator)
            except StopIteration:
                print(f"File {fp} is exhausted and will be skipped.")
                continue
            still_open.append((fp, iterator))
            yield record
        file_iterators = still_open

def write_shuffled_records_to_single_tfrecord(input_dir, allowed_indices, output_filepath):
    """
    Write every record produced by stream_shuffled_records into one big
    gzip-compressed TFRecord file, printing a progress line every 1000 records.

    The writer is always closed, even when reading or writing fails part-way, so the
    output file handle is never leaked.

    Args:
        input_dir (str): Directory containing the source TFRecord files.
        allowed_indices (list): List of allowed starting indices.
        output_filepath (str): Full path to the output TFRecord file. Its parent
            directory is created if it does not exist yet.
    """
    # Make sure the destination directory exists before the writer tries to open the file.
    parent_dir = os.path.dirname(output_filepath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Set up the TFRecord writer with gzip compression.
    options = tf.io.TFRecordOptions(compression_type="GZIP")
    writer = tf.io.TFRecordWriter(output_filepath, options=options)

    record_count = 0
    try:
        # Stream through the records.
        for record in stream_shuffled_records(input_dir, allowed_indices):
            writer.write(record.numpy())
            record_count += 1
            # Print a status update every 1000 records.
            if record_count % 1000 == 0:
                print(f"{record_count} records written...")
    finally:
        writer.close()

    print(f"Finished writing {record_count} records to {output_filepath}")

In [None]:
if __name__ == "__main__":
    # Window shifts to merge: every tiny shard whose filename starts with one of these is read.
    # Available shifts: 0000, 1000, 1250, 1667, 2000, 2500, 3000, 3334, 3750, 4000
    allowed_indices = ["0000", "2000"]

    # Source: the directory holding the tiny shards.
    input_directory = datasets_path + "Shuffle Shards"

    # Destination: a single combined TFRecord under AugDataSets/.
    save_directory = datasets_path + "AugDataSets/"
    output_filename = "00002000_shuffled.tfrecord.gz"
    output_filepath = os.path.join(save_directory, output_filename)

    # Interleave the selected shards and write them out as one file.
    write_shuffled_records_to_single_tfrecord(
        input_dir=input_directory,
        allowed_indices=allowed_indices,
        output_filepath=output_filepath,
    )

1000 records written...
2000 records written...
3000 records written...
4000 records written...
5000 records written...
6000 records written...
7000 records written...
8000 records written...
9000 records written...
10000 records written...
11000 records written...
12000 records written...
13000 records written...
14000 records written...
15000 records written...
16000 records written...
17000 records written...
18000 records written...
19000 records written...
20000 records written...
21000 records written...
22000 records written...
23000 records written...
24000 records written...
25000 records written...
26000 records written...
27000 records written...
28000 records written...
29000 records written...
30000 records written...
31000 records written...
32000 records written...
33000 records written...
34000 records written...
35000 records written...
36000 records written...
37000 records written...
38000 records written...
39000 records written...
40000 records written...
41000 rec

2025-03-07 02:33:56.475665: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


File Shuffle Shards/2000_1_18_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_1_11_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_1_08_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_05_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_1_13_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_1_15_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_1_23_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_1_19_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_07_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_1_20_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_1_17_tiny_inex_shard.tfrecord.gz is exhausted and will 

2025-03-07 02:33:59.682969: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


File Shuffle Shards/0000_0_12_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_1_03_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_1_08_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_0_04_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_0_05_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_0_09_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_1_11_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_1_10_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_0_06_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_1_04_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_0_08_tiny_inex_shard.tfrecord.gz is exhausted and will 

In [None]:
if __name__ == "__main__":
    # Window shifts to merge: every tiny shard whose filename starts with one of these is read.
    allowed_indices = ["0000", "1000", "2000", "3000", "4000"]

    # Source: the directory holding the tiny shards.
    input_directory = datasets_path + "Shuffle Shards"

    # Destination: a single combined TFRecord under AugDataSets/.
    save_directory = datasets_path + "AugDataSets/"
    output_filename = "All_augmented_shuffled.tfrecord.gz"
    output_filepath = os.path.join(save_directory, output_filename)

    # Interleave the selected shards and write them out as one file.
    write_shuffled_records_to_single_tfrecord(
        input_dir=input_directory,
        allowed_indices=allowed_indices,
        output_filepath=output_filepath,
    )

1000 records written...
2000 records written...
3000 records written...
4000 records written...
5000 records written...
6000 records written...
7000 records written...
8000 records written...
9000 records written...
10000 records written...
11000 records written...
12000 records written...
13000 records written...
14000 records written...
15000 records written...
16000 records written...
17000 records written...
18000 records written...
19000 records written...
20000 records written...
21000 records written...
22000 records written...
23000 records written...
24000 records written...
25000 records written...
26000 records written...
27000 records written...
28000 records written...
29000 records written...
30000 records written...
31000 records written...
32000 records written...
33000 records written...
34000 records written...
35000 records written...
36000 records written...
37000 records written...
38000 records written...
39000 records written...
40000 records written...
41000 rec

2025-03-07 06:36:05.065676: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


File Shuffle Shards/1000_2_04_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_2_19_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_2_10_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_2_20_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_2_13_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_2_22_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/1000_2_02_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_2_11_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_2_14_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_2_12_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_2_21_tiny_inex_shard.tfrecord.gz is exhausted and will 

2025-03-07 06:36:19.122416: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


File Shuffle Shards/3000_0_00_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/3000_0_06_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_0_21_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_0_16_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_0_09_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_0_13_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_0_07_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/3000_0_23_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_1_05_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/3000_0_11_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/1000_3_03_tiny_inex_shard.tfrecord.gz is exhausted and will 

In [None]:
if __name__ == "__main__":
    # Window shifts to merge: every tiny shard whose filename starts with one of these is read.
    allowed_indices = ["0000", "2000", "3000"]

    # Source: the directory holding the tiny shards.
    input_directory = datasets_path + "Shuffle Shards"

    # Destination: a single combined TFRecord under AugDataSets/.
    save_directory = datasets_path + "AugDataSets/"
    output_filename = "000020003000_shuffled.tfrecord.gz"
    output_filepath = os.path.join(save_directory, output_filename)

    # Interleave the selected shards and write them out as one file.
    write_shuffled_records_to_single_tfrecord(
        input_dir=input_directory,
        allowed_indices=allowed_indices,
        output_filepath=output_filepath,
    )

1000 records written...
2000 records written...
3000 records written...
4000 records written...
5000 records written...
6000 records written...
7000 records written...
8000 records written...
9000 records written...
10000 records written...
11000 records written...
12000 records written...
13000 records written...
14000 records written...
15000 records written...
16000 records written...
17000 records written...
18000 records written...
19000 records written...
20000 records written...
21000 records written...
22000 records written...
23000 records written...
24000 records written...
25000 records written...
26000 records written...
27000 records written...
28000 records written...
29000 records written...
30000 records written...
31000 records written...
32000 records written...
33000 records written...
34000 records written...
35000 records written...
36000 records written...
37000 records written...
38000 records written...
39000 records written...
40000 records written...
41000 rec

In [None]:
if __name__ == "__main__":
    # Window shifts to merge: every tiny shard whose filename starts with one of these is read.
    allowed_indices = ["0000", "1000", "2000", "3000"]

    # Source: the directory holding the tiny shards.
    input_directory = datasets_path + "Shuffle Shards"

    # Destination: a single combined TFRecord under AugDataSets/.
    save_directory = datasets_path + "AugDataSets/"
    output_filename = "0000100020003000_shuffled.tfrecord.gz"
    output_filepath = os.path.join(save_directory, output_filename)

    # Interleave the selected shards and write them out as one file.
    write_shuffled_records_to_single_tfrecord(
        input_dir=input_directory,
        allowed_indices=allowed_indices,
        output_filepath=output_filepath,
    )

1000 records written...
2000 records written...
3000 records written...
4000 records written...
5000 records written...
6000 records written...
7000 records written...
8000 records written...
9000 records written...
10000 records written...
11000 records written...
12000 records written...
13000 records written...
14000 records written...
15000 records written...
16000 records written...
17000 records written...
18000 records written...
19000 records written...
20000 records written...
21000 records written...
22000 records written...
23000 records written...
24000 records written...
25000 records written...
26000 records written...
27000 records written...
28000 records written...
29000 records written...
30000 records written...
31000 records written...
32000 records written...
33000 records written...
34000 records written...
35000 records written...
36000 records written...
37000 records written...
38000 records written...
39000 records written...
40000 records written...
41000 rec

2025-03-07 11:48:45.022786: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


File Shuffle Shards/1000_2_00_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
942000 records written...
File Shuffle Shards/0000_2_14_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_15_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_13_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_17_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_22_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_23_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_21_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_11_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_18_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_19_tiny_inex_shard.tfrecord

In [None]:
if __name__ == "__main__":
    # Window shifts to merge: every tiny shard whose filename starts with one of these is read.
    allowed_indices = ["0000", "2500"]

    # Source: the directory holding the tiny shards.
    input_directory = datasets_path + "Shuffle Shards"

    # Destination: a single combined TFRecord under AugDataSets/.
    save_directory = datasets_path + "AugDataSets/"
    output_filename = "00002500_shuffled.tfrecord.gz"
    output_filepath = os.path.join(save_directory, output_filename)

    # Interleave the selected shards and write them out as one file.
    write_shuffled_records_to_single_tfrecord(
        input_dir=input_directory,
        allowed_indices=allowed_indices,
        output_filepath=output_filepath,
    )

In [None]:
if __name__ == "__main__":
    # NOTE(review): this cell uses the same indices and output filename as an earlier cell
    # in this notebook, so running it overwrites that cell's output file.
    # Window shifts to merge: every tiny shard whose filename starts with one of these is read.
    allowed_indices = ["0000", "1000", "2000", "3000"]

    # Source: the directory holding the tiny shards.
    input_directory = datasets_path + "Shuffle Shards"

    # Destination: a single combined TFRecord under AugDataSets/.
    save_directory = datasets_path + "AugDataSets/"
    output_filename = "0000100020003000_shuffled.tfrecord.gz"
    output_filepath = os.path.join(save_directory, output_filename)

    # Interleave the selected shards and write them out as one file.
    write_shuffled_records_to_single_tfrecord(
        input_dir=input_directory,
        allowed_indices=allowed_indices,
        output_filepath=output_filepath,
    )

In [None]:
if __name__ == "__main__":
    # Define the directory containing your shuffled tiny shards.
    input_directory = datasets_path + "Shuffle Shards"  # Adjust as needed.

    # Index shifts allowed. Choose from:
    # 0000, 1000, 1250, 1667, 2000, 2500, 3000, 3334, 3750, 4000
    # Kept as zero-padded strings on purpose: the bare literal 0000 is the integer 0, which
    # is turned into the prefix "0" and would match any filename starting with "0" rather
    # than only the "0000" set.
    allowed_indices = ["0000", "1000", "1250", "1667", "2000",
                       "2500", "3000", "3334", "3750", "4000"]

    # Define the output filepath for the big combined TFRecord.
    output_filename = "all_expanded_shuffled.tfrecord.gz"
    save_directory = datasets_path + "AugDataSets/"
    output_filepath = os.path.join(save_directory, output_filename)

    # Run the function to write the big TFRecord.
    write_shuffled_records_to_single_tfrecord(input_directory, allowed_indices, output_filepath)

1000 records written...
2000 records written...
3000 records written...
4000 records written...
5000 records written...
6000 records written...
7000 records written...
8000 records written...
9000 records written...
10000 records written...
11000 records written...
12000 records written...
13000 records written...
14000 records written...
15000 records written...
16000 records written...
17000 records written...
18000 records written...
19000 records written...
20000 records written...
21000 records written...
22000 records written...
23000 records written...
24000 records written...
25000 records written...
26000 records written...
27000 records written...
28000 records written...
29000 records written...
30000 records written...
31000 records written...
32000 records written...
33000 records written...
34000 records written...
35000 records written...
36000 records written...
37000 records written...
38000 records written...
39000 records written...
40000 records written...
41000 rec

2025-03-09 14:46:52.642813: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


File Shuffle Shards/4000_2_14_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/1000_2_01_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/3750_2_23_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/1000_2_00_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_2_16_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_2_22_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_2_12_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_2_20_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/3750_2_22_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_2_21_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/1000_2_03_tiny_inex_shard.tfrecord.gz is exhausted and will 

2025-03-09 14:47:01.413785: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


File Shuffle Shards/0000_2_00_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_1_16_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_1_11_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_1_14_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_05_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_1_22_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_1_19_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_10_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_09_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
2358000 records written...
File Shuffle Shards/0000_2_03_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2000_1_09_tiny_inex_shard.tfrecor

2025-03-09 14:47:15.678986: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


File Shuffle Shards/1667_3_23_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/1250_2_03_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/1250_2_08_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/1667_3_17_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/1667_3_19_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/1250_2_00_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/1667_3_22_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/1667_3_16_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/3334_2_20_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/3334_2_22_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/3334_2_12_tiny_inex_shard.tfrecord.gz is exhausted and will 

2025-03-09 14:47:25.714473: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


File Shuffle Shards/3750_1_14_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_1_05_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/3750_1_09_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/1000_0_01_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_1_06_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_1_01_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_0_04_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/1000_3_08_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_0_02_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/3750_1_01_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/4000_1_21_tiny_inex_shard.tfrecord.gz is exhausted and will 

2365261 Total records

In [None]:
if __name__ == "__main__":
    # Merge the tiny shuffled shards belonging to the selected window shifts
    # into a single large TFRecord file.

    # Directory containing the shuffled tiny shards.
    # os.path.join is used so the path is correct whether or not
    # `datasets_path` ends with a separator.
    input_directory = os.path.join(datasets_path, "Shuffle Shards")  # Adjust as needed.

    # Window-shift start indices to include (adjust to your needs).
    # Choose from: [0000, 1000, 1250, 1667, 2000, 2500, 3000, 3334, 3750, 4000]
    # The logged shard names show the shift as a zero-padded 4-digit prefix
    # (e.g. "0000_2_01_tiny_inex_shard.tfrecord.gz"), but the values here are
    # plain ints: write 0, not 0000. Python 3 only tolerates leading zeros on a
    # literal that is entirely zeros, so something like 0500 is a SyntaxError.
    allowed_indices = [0, 2500]

    # Output location for the big combined TFRecord.
    # NOTE: the filename prefix mirrors `allowed_indices` ("0000" + "2500");
    # update it by hand when the index selection changes so a previous
    # dataset is not overwritten under a stale name.
    output_filename = "00002500_shuffled.tfrecord.gz"
    save_directory = os.path.join(datasets_path, "AugDataSets")
    output_filepath = os.path.join(save_directory, output_filename)

    # Make sure the destination exists before the long-running write starts;
    # without this the cell depends on the directory having been created by hand.
    os.makedirs(save_directory, exist_ok=True)

    # Run the function to write the big TFRecord.
    write_shuffled_records_to_single_tfrecord(input_directory, allowed_indices, output_filepath)

1000 records written...
2000 records written...
3000 records written...
4000 records written...
5000 records written...
6000 records written...
7000 records written...
8000 records written...
9000 records written...
10000 records written...
11000 records written...
12000 records written...
13000 records written...
14000 records written...
15000 records written...
16000 records written...
17000 records written...
18000 records written...
19000 records written...
20000 records written...
21000 records written...
22000 records written...
23000 records written...
24000 records written...
25000 records written...
26000 records written...
27000 records written...
28000 records written...
29000 records written...
30000 records written...
31000 records written...
32000 records written...
33000 records written...
34000 records written...
35000 records written...
36000 records written...
37000 records written...
38000 records written...
39000 records written...
40000 records written...
41000 rec

2025-03-12 02:18:18.286003: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


File Shuffle Shards/0000_2_01_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_03_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_06_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_05_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_00_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_07_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_09_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_04_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_08_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_10_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_2_02_tiny_inex_shard.tfrecord.gz is exhausted and will 

2025-03-12 02:19:15.827147: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


472000 records written...
File Shuffle Shards/2500_0_21_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_0_22_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_0_23_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_0_20_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_0_18_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_0_17_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_0_10_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_0_04_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_0_02_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_0_07_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_0_12_tiny_inex_shard.tfrecord

2025-03-12 02:19:18.327437: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


File Shuffle Shards/2500_1_06_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_1_00_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_1_04_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_1_01_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_1_03_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_1_07_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_1_05_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_1_02_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_1_08_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_0_16_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/0000_1_23_tiny_inex_shard.tfrecord.gz is exhausted and will 

2025-03-12 02:19:19.323740: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


File Shuffle Shards/2500_3_06_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_2_11_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_2_16_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_3_22_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_2_18_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_3_14_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_2_23_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_3_17_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_3_03_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_3_11_tiny_inex_shard.tfrecord.gz is exhausted and will be skipped.
File Shuffle Shards/2500_3_05_tiny_inex_shard.tfrecord.gz is exhausted and will 