# SharedMemoryDataLoader Demo & Benchmark

This notebook demonstrates how to use the SharedMemoryDataLoader and runs a small benchmark to test its performance.


In [1]:
import time
import torch
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display, clear_output

# Use the new factory pattern!
from data_loader import actvs_loader_from_test_config

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Samples requested per batch; the benchmark cell below reuses this value
# when converting batch counts into sample counts.
batch_size = 1_000

# Build the loader through the test-config factory imported above.
loader = actvs_loader_from_test_config(batch_size=batch_size)

In [4]:
# No explicit wait happens here: the first next() call is what pulls a batch
# out of the loader.
print("üì¶ Loading first batch...")
batch = next(iter(loader))

# Summarise the batch: shape, dtype, device and its footprint in MB
# (element count * bytes per element, scaled from bytes to MB).
print(
    "‚úÖ Successfully loaded batch!",
    f"   Shape: {batch.shape}",
    f"   Data type: {batch.dtype}",
    f"   Device: {batch.device}",
    f"   Memory usage: {batch.numel() * batch.element_size() / 1024 / 1024:.2f} MB",
    sep="\n",
)

# Peek at a small slice: first 5 entries along dim 0, up to 10 along dim 1.
print("\nüìä Sample data (first 5 sequences, first 10 tokens):")
print(batch[:5, :10])


üì¶ Loading first batch...


Token indices sequence length is longer than the specified maximum sequence length for this model (1198 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1328 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1228 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (4882 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1702 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence leng

Buffer:  2,046/100,000 (  2.0%) | Rate: 4387/s | Up:     0s | Device: GPU | GENERATING‚úÖ Successfully loaded batch!
   Shape: torch.Size([1000, 2, 12, 768])
   Data type: torch.float32
   Device: cpu
   Memory usage: 70.31 MB

üìä Sample data (first 5 sequences, first 10 tokens):
tensor([[[[ 1.3722e+00, -2.8015e-02,  2.4323e-01,  ..., -1.3980e-01,
           -4.8099e-02,  6.9061e-02],
          [ 1.9641e+00, -2.4010e-01,  1.3248e+00,  ..., -1.3025e+00,
           -2.7582e-02, -1.2355e+00],
          [ 1.9556e+00, -1.6953e-02,  1.6919e+00,  ..., -1.4190e+00,
            3.0624e-01, -8.4634e-01],
          ...,
          [-7.1036e-01,  1.0561e+00,  1.5696e+00,  ..., -1.0380e+01,
            5.5126e+00, -4.8519e+00],
          [ 7.2985e-01,  8.7606e-02,  2.2963e+00,  ..., -1.2693e+01,
            4.9563e+00, -3.9042e+00],
          [ 2.8182e-01, -2.9107e+00,  6.5465e-01,  ..., -1.2993e+01,
            5.3244e+00, -3.6609e+00]],

         [[ 3.7553e-01, -1.6159e-01,  8.9176e-01,  ..., -7

Buffer: 80,840/100,000 ( 80.8%) | Rate: 69954/s | Up:     8s | Device: GPU | GENERATING[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Buffer: 100,000/100,000 (100.0%) | Rate:  760/s | Up: 339:51 | Device: CPU |   SLEEPING[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[

In [5]:
def run_mini_benchmark(loader, duration=10, batch_size=1000):
    """Measure how quickly ``loader`` hands out batches over a fixed time window.

    One warm-up batch is pulled first (it is not counted) so the per-batch
    memory footprint can be reported. Batches are then drawn from a single
    iterator until ``duration`` seconds have elapsed, timing each ``next()``
    call individually. If the iterator is exhausted mid-run, a fresh pass over
    ``loader`` is started so finite loaders can still be benchmarked.

    Args:
        loader: Iterable yielding batches that expose ``numel()`` and
            ``element_size()`` (for example ``torch.Tensor``).
        duration: Length of the measurement window in seconds.
        batch_size: Number of samples assumed per batch. Only used to turn
            batch counts into sample counts in the report.

    Returns:
        dict with the keys ``duration`` (seconds actually measured),
        ``batches``, ``total_samples``, ``total_mb``, ``mb_per_second``,
        ``samples_per_second``, ``avg_batch_time_ms`` and ``batch_times``
        (per-batch fetch latency in milliseconds). Rates and the average are
        ``0.0`` when no batch was timed.

    Raises:
        StopIteration: If ``loader`` yields nothing for the warm-up batch.
    """
    print(f"üèÉ Running mini benchmark for {duration} seconds...")

    bytes_per_mb = 1024 * 1024

    # Create the iterator once. Re-creating it for every batch would time
    # iterator construction instead of batch delivery, and on loaders whose
    # __iter__ restarts the pass it would keep returning the same first batch.
    batch_iter = iter(loader)

    # Warm-up batch: used only to report the memory footprint of one batch.
    first_batch = next(batch_iter)
    mb_per_batch = first_batch.numel() * first_batch.element_size() / bytes_per_mb

    print(f"   Batch size: {batch_size} samples")
    print(f"   Memory per batch: {mb_per_batch:.2f} MB")
    print("   Starting benchmark...\n")

    # perf_counter() is monotonic and high resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    start_time = time.perf_counter()
    batch_count = 0
    total_bytes = 0
    batch_times = []

    try:
        while time.perf_counter() - start_time < duration:
            batch_start = time.perf_counter()
            try:
                batch = next(batch_iter)
            except StopIteration:
                # Finite loader ran dry: begin another pass. The restart cost
                # is charged to this batch's latency.
                batch_iter = iter(loader)
                batch = next(batch_iter, None)
                if batch is None:
                    raise RuntimeError("loader yielded no batches") from None
            batch_time = time.perf_counter() - batch_start

            batch_count += 1
            # Account for the bytes actually delivered rather than assuming
            # every batch matches the warm-up batch.
            total_bytes += batch.numel() * batch.element_size()
            batch_times.append(batch_time * 1000)  # Convert to ms

            # Print progress every few batches
            if batch_count % 3 == 0:
                elapsed = time.perf_counter() - start_time
                current_mb_s = (total_bytes / bytes_per_mb) / elapsed if elapsed > 0 else 0.0
                print(f"   Batch {batch_count}: {batch_time*1000:.1f}ms, Running avg: {current_mb_s:.1f} MB/s")

    except Exception as e:
        # Best effort: report whatever was measured before the failure.
        print(f"   ‚ö†Ô∏è  Benchmark interrupted: {e}")

    # Calculate final results, guarding the degenerate cases (zero elapsed
    # time, no timed batches) so the summary never raises or contains NaN.
    total_time = time.perf_counter() - start_time
    total_mb = total_bytes / bytes_per_mb
    total_samples = batch_count * batch_size
    mb_per_second = total_mb / total_time if total_time > 0 else 0.0
    samples_per_second = total_samples / total_time if total_time > 0 else 0.0
    avg_batch_time = np.mean(batch_times) if batch_times else 0.0

    return {
        'duration': total_time,
        'batches': batch_count,
        'total_samples': total_samples,
        'total_mb': total_mb,
        'mb_per_second': mb_per_second,
        'samples_per_second': samples_per_second,
        'avg_batch_time_ms': avg_batch_time,
        'batch_times': batch_times
    }

# Kick off a 10-second measurement against the loader built earlier,
# reporting sample counts with the same batch size it was configured with.
results = run_mini_benchmark(
    loader,
    duration=10,
    batch_size=batch_size,
)


üèÉ Running mini benchmark for 10 seconds...
   Batch size: 1000 samples
   Memory per batch: 70.31 MB
   Starting benchmark...

   Batch 3: 45.0ms, Running avg: 2230.8 MB/s
   Batch 6: 92.8ms, Running avg: 1307.7 MB/s
   Batch 9: 100.3ms, Running avg: 1051.3 MB/s
   Batch 12: 86.6ms, Running avg: 1008.2 MB/s
   Batch 15: 58.0ms, Running avg: 1049.3 MB/s
Buffer: 85,046/100,000 ( 85.0%) | Rate: 2122/s | Up:    23s | Device: CPU | GENERATING   Batch 18: 82.6ms, Running avg: 1064.3 MB/s
   Batch 21: 81.4ms, Running avg: 1058.0 MB/s
   Batch 24: 77.6ms, Running avg: 1030.5 MB/s
   Batch 27: 81.6ms, Running avg: 1035.2 MB/s
   Batch 30: 100.0ms, Running avg: 1015.1 MB/s
Buffer: 71,092/100,000 ( 71.1%) | Rate: 1904/s | Up:    24s | Device: CPU | GENERATING   Batch 33: 37.3ms, Running avg: 1048.3 MB/s
   Batch 36: 62.5ms, Running avg: 1036.7 MB/s
   Batch 39: 62.6ms, Running avg: 1030.0 MB/s
   Batch 42: 77.3ms, Running avg: 1017.7 MB/s
   Batch 45: 68.2ms, Running avg: 1003.2 MB/s
   Batch 

Buffer: 67,484/100,000 ( 67.5%) | Rate: 72644/s | Up:    36s | Device: GPU | GENERATING[2K[2K[2K[2K[2K[2K

In [6]:
# Human-readable summary of the dict returned by run_mini_benchmark:
# one line per metric, emitted in a single print call.
print(
    "\nBenchmark Results:",
    f"Duration: {results['duration']:.1f} seconds",
    f"Total batches: {results['batches']:,}",
    f"Total samples: {results['total_samples']:,}",
    f"Data throughput: {results['mb_per_second']:.1f} MB/s",
    f"Sample throughput: {results['samples_per_second']:.1f} samples/s",
    f"Average batch time: {results['avg_batch_time_ms']:.1f} ms",
    sep="\n",
)



Benchmark Results:
Duration: 10.2 seconds
Total batches: 142
Total samples: 142,000
Data throughput: 981.9 MB/s
Sample throughput: 13965.1 samples/s
Average batch time: 71.5 ms


Buffer: 100,000/100,000 (100.0%) | Rate: 4234/s | Up:   5:36 | Device: CPU |   SLEEPING[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K[2K