In [2]:
import h5py
import numpy as np
import pandas as pd

In [3]:
# Inspect the HDF5 dataset: list its members, then preview a few rows of each
file_path = "Dataset/GOLD_XYZ_OSC.0001_1024.hdf5"

# The file is opened read-only and closed automatically when the block exits
with h5py.File(file_path, 'r') as hdf5_file:
    dataset_names = list(hdf5_file.keys())
    print("Keys in the HDF5 file:", dataset_names)

    # Report shape and dtype for every top-level dataset
    for dataset_name in dataset_names:
        dataset = hdf5_file[dataset_name]
        print(f"{dataset_name} shape: {dataset.shape}")
        print(f"{dataset_name} dtype: {dataset.dtype}")
        print()

    # Preview: first five rows of the first entry in X
    first_frame = hdf5_file['X'][0]
    print("Sample X data (first frame, first 5 samples):")
    print(first_frame[:5])
    print()

    # Preview: first ten rows of Y
    first_labels = hdf5_file['Y'][:10]
    print("Sample Y data (first 10 labels):")
    print(first_labels)
    print()

    # Preview: first ten rows of Z
    first_snrs = hdf5_file['Z'][:10]
    print("Sample Z data (first 10 SNR values):")
    print(first_snrs)

Keys in the HDF5 file: ['X', 'Y', 'Z']
X shape: (2555904, 1024, 2)
X dtype: float32

Y shape: (2555904, 24)
Y dtype: int64

Z shape: (2555904, 1)
Z dtype: int64

Sample X data (first frame, first 5 samples):
[[ 0.0420274   0.23476323]
 [-0.2728826   0.40513492]
 [-0.26707262  0.22749889]
 [-0.31485087 -0.1764586 ]
 [ 0.96334124 -1.0257102 ]]

Sample Y data (first 10 labels):
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]

Sample Z data (first 10 SNR values):
[[-20]
 [-20]
 [-20]
 [-20]
 [-20]
 [-20]
 [-20]
 [-20]
 [-20]
 [-20]]


In [4]:
# Load class names mapping
import json
import os

# JSON is UTF-8 by specification; be explicit so the platform default is not used
with open("Dataset/classes-fixed.json", 'r', encoding="utf-8") as f:
    class_names = json.load(f)

print("Modulation classes:", class_names)
print(f"Total classes: {len(class_names)}")

# Define SNR range (-20 to +30 in steps of 2)
snr_values = list(range(-20, 32, 2))
print("SNR values:", snr_values)
print(f"Total SNR values: {len(snr_values)}")

# Verify the total frames calculation against the dataset itself.
# The frame count is read from the file (first axis of X) rather than
# compared with a hard-coded literal, so the check fails if the file differs.
frames_per_snr = 4096
total_expected_frames = len(class_names) * len(snr_values) * frames_per_snr
with h5py.File(file_path, 'r') as hdf5_file:
    actual_total_frames = hdf5_file['X'].shape[0]

print(f"Expected total frames: {total_expected_frames}")
print(f"Actual total frames in dataset: {actual_total_frames}")
print(f"Match: {total_expected_frames == actual_total_frames}")

Modulation classes: ['OOK', '4ASK', '8ASK', 'BPSK', 'QPSK', '8PSK', '16PSK', '32PSK', '16APSK', '32APSK', '64APSK', '128APSK', '16QAM', '32QAM', '64QAM', '128QAM', '256QAM', 'AM-SSB-WC', 'AM-SSB-SC', 'AM-DSB-WC', 'AM-DSB-SC', 'FM', 'GMSK', 'OQPSK']
Total classes: 24
SNR values: [-20, -18, -16, -14, -12, -10, -8, -6, -4, -2, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
Total SNR values: 26
Expected total frames: 2555904
Actual total frames in dataset: 2555904
Match: True


In [5]:
# Build the output tree: <base_dir>/snr_<value>/<class_name>/
base_dir = "Dataset"

print("Creating directory structure...")
for snr in snr_values:
    snr_dir = os.path.join(base_dir, f"snr_{snr}")
    for class_name in class_names:
        # exist_ok=True makes re-running this cell harmless
        os.makedirs(os.path.join(snr_dir, class_name), exist_ok=True)

print("Directory structure created successfully!")

# Spot-check a 3x3 corner of the tree (first 3 SNR values x first 3 classes)
print("\nSample directories created:")
for snr in snr_values[:3]:
    for class_name in class_names[:3]:
        dir_path = os.path.join(base_dir, f"snr_{snr}", class_name)
        print(f"✓ {dir_path}" if os.path.exists(dir_path) else f"✗ {dir_path}")

Creating directory structure...
Directory structure created successfully!

Sample directories created:
✓ Dataset\snr_-20\OOK
✓ Dataset\snr_-20\4ASK
✓ Dataset\snr_-20\8ASK
✓ Dataset\snr_-18\OOK
✓ Dataset\snr_-18\4ASK
✓ Dataset\snr_-18\8ASK
✓ Dataset\snr_-16\OOK
✓ Dataset\snr_-16\4ASK
✓ Dataset\snr_-16\8ASK


In [6]:
# Frame extraction and saving logic
def extract_and_save_frames():
    """
    Extract frames from HDF5 file and save them as .py files
    Structure: Dataset/snr_value/class_name/frame_N.py

    The dataset is assumed to be laid out class-major, then SNR, with 4096
    consecutive frames per (class, SNR) combination. Every frame is checked
    against that assumption and a warning is printed on mismatch; the frame
    is still written to the directory implied by its position.

    Reads the module-level names ``file_path``, ``class_names``,
    ``snr_values`` and ``base_dir``. Existing ``frame_N.py`` files are
    overwritten. Output directories are created on demand.

    Raises:
        ValueError: if the dataset holds fewer frames than
            len(class_names) * len(snr_values) * 4096.
    """

    print("Starting frame extraction...")

    with h5py.File(file_path, 'r') as hdf5_file:
        X = hdf5_file['X']  # I/Q data
        Y = hdf5_file['Y']  # One-hot encoded labels
        Z = hdf5_file['Z']  # SNR values

        total_frames = X.shape[0]
        frames_per_snr_class = 4096  # 4096 frames per modulation-SNR combination

        # Fail before writing anything if the file cannot supply every frame
        expected_frames = len(class_names) * len(snr_values) * frames_per_snr_class
        if total_frames < expected_frames:
            raise ValueError(
                f"Dataset has {total_frames} frames but {expected_frames} are required "
                f"({len(class_names)} classes x {len(snr_values)} SNR values x "
                f"{frames_per_snr_class} frames)"
            )

        frame_idx = 0
        processed_frames = 0

        # Iterate through each modulation class
        for class_idx, class_name in enumerate(class_names):
            print(f"Processing class {class_idx + 1}/{len(class_names)}: {class_name}")

            # Iterate through each SNR value
            for snr_value in snr_values:

                # Create directory path (and the directory itself, so this
                # function does not depend on another cell having run)
                dir_path = os.path.join(base_dir, f"snr_{snr_value}", class_name)
                os.makedirs(dir_path, exist_ok=True)

                # Read the whole class-SNR block with one slice per dataset
                # instead of three HDF5 reads per frame. For float32 frames of
                # shape (1024, 2) the X block is about 32 MiB.
                block_stop = frame_idx + frames_per_snr_class
                x_block = X[frame_idx:block_stop]
                y_block = Y[frame_idx:block_stop]
                z_block = Z[frame_idx:block_stop]

                # Extract frames for this class-SNR combination
                for frame_num in range(frames_per_snr_class):
                    # Get the frame data
                    frame_data = x_block[frame_num]  # Shape: (1024, 2)
                    label_data = y_block[frame_num]  # Shape: (24,) - one-hot encoded
                    snr_data = z_block[frame_num][0]  # Single SNR value

                    # Verify this frame belongs to the expected class and SNR
                    expected_class_idx = np.argmax(label_data)
                    if expected_class_idx != class_idx or snr_data != snr_value:
                        print(f"Warning: Frame {frame_idx} has unexpected class {expected_class_idx} or SNR {snr_data}")

                    # Create the .py file content
                    file_content = f'''# Frame {frame_num} for {class_name} at {snr_value} dB SNR
# Generated from GOLD_XYZ_OSC.0001_1024.hdf5

import numpy as np

# Frame metadata
class_name = "{class_name}"
class_index = {class_idx}
snr_db = {snr_value}
frame_number = {frame_num}

# I/Q data shape: (1024, 2) - [I, Q] components
frame_data = np.array({frame_data.tolist()}, dtype=np.float32)

# One-hot encoded label (24 classes)
label = np.array({label_data.tolist()}, dtype=np.int64)

# Usage example:
# i_component = frame_data[:, 0]  # In-phase component
# q_component = frame_data[:, 1]  # Quadrature component
'''

                    # Save the file (UTF-8 is the default Python source encoding)
                    file_name = f"frame_{frame_num}.py"
                    file_path_save = os.path.join(dir_path, file_name)

                    with open(file_path_save, 'w', encoding="utf-8") as f:
                        f.write(file_content)

                    frame_idx += 1
                    processed_frames += 1

                    # Progress update every 10000 frames
                    if processed_frames % 10000 == 0:
                        print(f"Processed {processed_frames}/{total_frames} frames ({processed_frames/total_frames*100:.1f}%)")

                print(f"  Completed SNR {snr_value} dB: {frames_per_snr_class} frames saved")

            print(f"Completed class {class_name}: {len(snr_values) * frames_per_snr_class} frames saved")

        # "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
        print(f"\nExtraction complete! Total frames processed: {processed_frames}")
        print(f"Total files created: {processed_frames}")

# Defining the function above does not start the job; the full run writes one
# file per frame (2.5M+ files), so it is only announced here.
for message in (
    "Ready to extract frames. This will create 2,555,904 files.",
    "Run extract_and_save_frames() to start the extraction process.",
):
    print(message)

Ready to extract frames. This will create 2,555,904 files.
Run extract_and_save_frames() to start the extraction process.


In [7]:
# Test extraction with a small subset (first 2 classes, first 2 SNR values, first 5 frames each)
def test_extraction():
    """
    Test the extraction process with a small subset of data

    Writes the first 5 frames for each of the first 2 classes at each of the
    first 2 SNR values (20 files in total) to
    Dataset/snr_value/class_name/frame_N.py, printing the expected and actual
    class index and SNR for every frame.

    Reads the module-level names ``file_path``, ``class_names``,
    ``snr_values`` and ``base_dir``. Output directories are created on
    demand and existing ``frame_N.py`` files are overwritten.

    Returns:
        int: number of frames written.
    """
    print("Starting test extraction...")

    with h5py.File(file_path, 'r') as hdf5_file:
        X = hdf5_file['X']  # I/Q data
        Y = hdf5_file['Y']  # One-hot encoded labels
        Z = hdf5_file['Z']  # SNR values

        processed_frames = 0

        # Number of consecutive frames the dataset stores per class-SNR combination
        dataset_frames_per_snr_class = 4096

        # Test with first 2 classes and first 2 SNR values
        test_classes = class_names[:2]  # ['OOK', '4ASK']
        test_snr_values = snr_values[:2]  # [-20, -18]
        test_frames_per_snr_class = 5  # Just 5 frames for testing

        # Iterate through test classes
        for class_idx, class_name in enumerate(test_classes):
            print(f"Processing test class {class_idx + 1}/{len(test_classes)}: {class_name}")

            # Iterate through test SNR values
            for snr_idx, snr_value in enumerate(test_snr_values):

                # Create directory path (and the directory itself, so this
                # function does not depend on another cell having run)
                dir_path = os.path.join(base_dir, f"snr_{snr_value}", class_name)
                os.makedirs(dir_path, exist_ok=True)

                print(f"  Processing SNR {snr_value} dB...")

                # Offset of this class-SNR block in the full dataset. The stride
                # uses the FULL SNR list because the file is laid out for all
                # SNR values, not just the ones under test.
                block_start = (class_idx * len(snr_values) + snr_idx) * dataset_frames_per_snr_class

                # Extract frames for this class-SNR combination
                for frame_num in range(test_frames_per_snr_class):
                    # Calculate the actual frame index in the dataset
                    actual_frame_idx = block_start + frame_num

                    # Get the frame data
                    frame_data = X[actual_frame_idx]  # Shape: (1024, 2)
                    label_data = Y[actual_frame_idx]  # Shape: (24,) - one-hot encoded
                    snr_data = Z[actual_frame_idx][0]  # Single SNR value

                    # Verify this frame belongs to the expected class and SNR
                    expected_class_idx = np.argmax(label_data)
                    print(f"    Frame {frame_num}: Expected class {class_idx}, got {expected_class_idx}, Expected SNR {snr_value}, got {snr_data}")

                    # Create the .py file content (metadata records the values
                    # actually read from the dataset, not the loop's expectations)
                    file_content = f'''# Frame {frame_num} for {class_name} at {snr_value} dB SNR
# Generated from GOLD_XYZ_OSC.0001_1024.hdf5

import numpy as np

# Frame metadata
class_name = "{class_name}"
class_index = {expected_class_idx}
snr_db = {snr_data}
frame_number = {frame_num}

# I/Q data shape: (1024, 2) - [I, Q] components
frame_data = np.array({frame_data.tolist()}, dtype=np.float32)

# One-hot encoded label (24 classes)
label = np.array({label_data.tolist()}, dtype=np.int64)

# Usage example:
# i_component = frame_data[:, 0]  # In-phase component
# q_component = frame_data[:, 1]  # Quadrature component
'''

                    # Save the file (UTF-8 is the default Python source encoding)
                    file_name = f"frame_{frame_num}.py"
                    file_path_save = os.path.join(dir_path, file_name)

                    with open(file_path_save, 'w', encoding="utf-8") as f:
                        f.write(file_content)

                    processed_frames += 1

                print(f"  Completed SNR {snr_value} dB: {test_frames_per_snr_class} frames saved")

        # "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n
        print(f"\nTest extraction complete! Total frames processed: {processed_frames}")
        return processed_frames

# Run the small-subset test extraction; test_frames receives the value returned
# by test_extraction() (its count of processed frames)
test_frames = test_extraction()

Starting test extraction...
Processing test class 1/2: OOK
  Processing SNR -20 dB...
    Frame 0: Expected class 0, got 0, Expected SNR -20, got -20
    Frame 1: Expected class 0, got 0, Expected SNR -20, got -20
    Frame 2: Expected class 0, got 0, Expected SNR -20, got -20
    Frame 3: Expected class 0, got 0, Expected SNR -20, got -20
    Frame 4: Expected class 0, got 0, Expected SNR -20, got -20
  Completed SNR -20 dB: 5 frames saved
  Processing SNR -18 dB...
    Frame 0: Expected class 0, got 0, Expected SNR -18, got -18
    Frame 1: Expected class 0, got 0, Expected SNR -18, got -18
    Frame 2: Expected class 0, got 0, Expected SNR -18, got -18
    Frame 3: Expected class 0, got 0, Expected SNR -18, got -18
    Frame 4: Expected class 0, got 0, Expected SNR -18, got -18
  Completed SNR -18 dB: 5 frames saved
Processing test class 2/2: 4ASK
  Processing SNR -20 dB...
    Frame 0: Expected class 1, got 1, Expected SNR -20, got -20
    Frame 1: Expected class 1, got 1, Expected

In [8]:
# Verify the test files were created
import glob

print("Checking created test files...")

test_dirs = ["Dataset/snr_-20/OOK", "Dataset/snr_-20/4ASK", "Dataset/snr_-18/OOK", "Dataset/snr_-18/4ASK"]

for test_dir in test_dirs:
    if os.path.exists(test_dir):
        # glob returns matches in arbitrary order; sort so the file chosen for
        # the preview below is the same on every run and platform
        files = sorted(glob.glob(os.path.join(test_dir, "*.py")))
        print(f"{test_dir}: {len(files)} files")
        if files:
            # Show first file content preview
            with open(files[0], 'r') as f:
                content = f.read()
                lines = content.split('\n')
                print(f"  Sample file: {os.path.basename(files[0])}")
                print(f"  First 10 lines:")
                for i, line in enumerate(lines[:10]):
                    print(f"    {i+1}: {line}")
                print("    ...")
            print()
    else:
        print(f"{test_dir}: Directory not found")

Checking created test files...
Dataset/snr_-20/OOK: 5 files
  Sample file: frame_0.py
  First 10 lines:
    1: # Frame 0 for OOK at -20 dB SNR
    2: # Generated from GOLD_XYZ_OSC.0001_1024.hdf5
    3: 
    4: import numpy as np
    5: 
    6: # Frame metadata
    7: class_name = "OOK"
    8: class_index = 0
    9: snr_db = -20
    10: frame_number = 0
    ...

Dataset/snr_-20/4ASK: 5 files
  Sample file: frame_0.py
  First 10 lines:
    1: # Frame 0 for 4ASK at -20 dB SNR
    2: # Generated from GOLD_XYZ_OSC.0001_1024.hdf5
    3: 
    4: import numpy as np
    5: 
    6: # Frame metadata
    7: class_name = "4ASK"
    8: class_index = 1
    9: snr_db = -20
    10: frame_number = 0
    ...

Dataset/snr_-18/OOK: 5 files
  Sample file: frame_0.py
  First 10 lines:
    1: # Frame 0 for OOK at -18 dB SNR
    2: # Generated from GOLD_XYZ_OSC.0001_1024.hdf5
    3: 
    4: import numpy as np
    5: 
    6: # Frame metadata
    7: class_name = "OOK"
    8: class_index = 0
    9: snr_db = -18
 

In [9]:
# Summarise what the full extraction will produce before it is launched
print("To run the full extraction, run the line below.")
print("This will create:")
for summary_line in (
    f"- {len(class_names)} modulation classes",
    f"- {len(snr_values)} SNR values per class",
    "- 4096 frames per class-SNR combination",
    f"- Total: {len(class_names) * len(snr_values) * 4096} files",
):
    print(summary_line)
print()

# Sketch the output tree using the first 3 SNR values and first 3 classes
print("Directory structure will be:")
print("Dataset/")
for snr in snr_values[:3]:
    print(f"  snr_{snr}/")
    for class_name in class_names[:3]:
        print(f"    {class_name}/")
        for leaf in (
            "      frame_0.py",
            "      frame_1.py",
            "      ...",
            "      frame_4095.py",
        ):
            print(leaf)
    print("    ...")
print("  ...")
print()

# Describe the contents of each generated file
for detail in (
    "Each .py file contains:",
    "- Frame metadata (class name, SNR, frame number)",
    "- I/Q data as numpy array (1024 samples, 2 components)",
    "- One-hot encoded label",
    "- Usage examples",
):
    print(detail)

To run the full extraction, run the line below.
This will create:
- 24 modulation classes
- 26 SNR values per class
- 4096 frames per class-SNR combination
- Total: 2555904 files

Directory structure will be:
Dataset/
  snr_-20/
    OOK/
      frame_0.py
      frame_1.py
      ...
      frame_4095.py
    4ASK/
      frame_0.py
      frame_1.py
      ...
      frame_4095.py
    8ASK/
      frame_0.py
      frame_1.py
      ...
      frame_4095.py
    ...
  snr_-18/
    OOK/
      frame_0.py
      frame_1.py
      ...
      frame_4095.py
    4ASK/
      frame_0.py
      frame_1.py
      ...
      frame_4095.py
    8ASK/
      frame_0.py
      frame_1.py
      ...
      frame_4095.py
    ...
  snr_-16/
    OOK/
      frame_0.py
      frame_1.py
      ...
      frame_4095.py
    4ASK/
      frame_0.py
      frame_1.py
      ...
      frame_4095.py
    8ASK/
      frame_0.py
      frame_1.py
      ...
      frame_4095.py
    ...
  ...

Each .py file contains:
- Frame metadata (class name, S

In [None]:
# Full extraction - WARNING: This will create 2,555,904 files and may take several hours!
# extract_and_save_frames() opens each output file in 'w' mode, so any
# frame_N.py files already present (e.g. from the test extraction) are overwritten.
extract_and_save_frames()