In [27]:
import h5py
import numpy as np
import pandas as pd

In [28]:
# Inspect the HDF5 dataset: list its keys, report each dataset's shape/dtype,
# and preview a handful of values from X, Y and Z.
file_path = "Dataset/GOLD_XYZ_OSC.0001_1024.hdf5"

# Read-only handle; the context manager closes the file once inspection is done
with h5py.File(file_path, 'r') as hdf5_file:
    dataset_names = list(hdf5_file.keys())
    print("Keys in the HDF5 file:", dataset_names)

    # One shape/dtype report per dataset, each followed by a blank line
    for name in dataset_names:
        dataset = hdf5_file[name]
        print(f"{name} shape: {dataset.shape}")
        print(f"{name} dtype: {dataset.dtype}", end="\n\n")

    # Preview of the signal data: first five samples of the first frame
    print("Sample X data (first frame, first 5 samples):")
    print(hdf5_file['X'][0][:5], end="\n\n")

    # Preview of the labels: first ten rows of Y
    print("Sample Y data (first 10 labels):")
    print(hdf5_file['Y'][:10], end="\n\n")

    # Preview of the SNR column: first ten rows of Z
    print("Sample Z data (first 10 SNR values):")
    print(hdf5_file['Z'][:10])

Keys in the HDF5 file: ['X', 'Y', 'Z']
X shape: (2555904, 1024, 2)
X dtype: float32

Y shape: (2555904, 24)
Y dtype: int64

Z shape: (2555904, 1)
Z dtype: int64

Sample X data (first frame, first 5 samples):
[[ 0.0420274   0.23476323]
 [-0.2728826   0.40513492]
 [-0.26707262  0.22749889]
 [-0.31485087 -0.1764586 ]
 [ 0.96334124 -1.0257102 ]]

Sample Y data (first 10 labels):
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]

Sample Z data (first 10 SNR values):
[[-20]
 [-20]
 [-20]
 [-20]
 [-20]
 [-20]
 [-20]
 [-20]
 [-20]
 [-20]]


In [29]:
# Load class names mapping
import json
import os

# Explicit encoding so the JSON is decoded the same way on every platform
with open("Dataset/classes-fixed.json", 'r', encoding="utf-8") as f:
    class_names = json.load(f)

print("Modulation classes:", class_names)
print(f"Total classes: {len(class_names)}")

# Define SNR range (-20 to +30 in steps of 2)
snr_values = list(range(-20, 32, 2))
print("SNR values:", snr_values)
print(f"Total SNR values: {len(snr_values)}")

# Verify the total frames calculation against the dataset itself.
# The actual count is read from the HDF5 file (first dimension of X) rather
# than typed in by hand, so the check can really fail if the file differs.
frames_per_snr = 4096
total_expected_frames = len(class_names) * len(snr_values) * frames_per_snr
with h5py.File(file_path, 'r') as hdf5_file:
    total_actual_frames = hdf5_file['X'].shape[0]

print(f"Expected total frames: {total_expected_frames}")
print(f"Actual total frames in dataset: {total_actual_frames}")
print(f"Match: {total_expected_frames == total_actual_frames}")

Modulation classes: ['OOK', '4ASK', '8ASK', 'BPSK', 'QPSK', '8PSK', '16PSK', '32PSK', '16APSK', '32APSK', '64APSK', '128APSK', '16QAM', '32QAM', '64QAM', '128QAM', '256QAM', 'AM-SSB-WC', 'AM-SSB-SC', 'AM-DSB-WC', 'AM-DSB-SC', 'FM', 'GMSK', 'OQPSK']
Total classes: 24
SNR values: [-20, -18, -16, -14, -12, -10, -8, -6, -4, -2, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
Total SNR values: 26
Expected total frames: 2555904
Actual total frames in dataset: 2555904
Match: True


In [30]:
# Build the output tree <base_dir>/snr_<value>/<class_name>/ for every
# SNR / modulation-class pair.
base_dir = "Dataset"

print("Creating directory structure...")
for snr in snr_values:
    snr_root = os.path.join(base_dir, f"snr_{snr}")
    for class_name in class_names:
        # exist_ok=True keeps the cell re-runnable without errors
        os.makedirs(os.path.join(snr_root, class_name), exist_ok=True)

print("Directory structure created successfully!")

# Spot-check: report whether the first 3 SNR x first 3 class directories exist
print("\nSample directories created:")
for snr in snr_values[:3]:
    for class_name in class_names[:3]:
        dir_path = os.path.join(base_dir, f"snr_{snr}", class_name)
        marker = "✓" if os.path.exists(dir_path) else "✗"
        print(f"{marker} {dir_path}")

Creating directory structure...
Directory structure created successfully!

Sample directories created:
✓ Dataset\snr_-20\OOK
✓ Dataset\snr_-20\4ASK
✓ Dataset\snr_-20\8ASK
✓ Dataset\snr_-18\OOK
✓ Dataset\snr_-18\4ASK
✓ Dataset\snr_-18\8ASK
✓ Dataset\snr_-16\OOK
✓ Dataset\snr_-16\4ASK
✓ Dataset\snr_-16\8ASK


In [31]:
# Frame extraction and saving logic for .npy format
def extract_and_save_frames(frames_per_snr_class=4096):
    """
    Extract frames from the HDF5 file and save each one as its own .npy file.

    Structure: <base_dir>/snr_<value>/<class_name>/frame_N.npy
    Format: 2x1024 array where row 0 = I component, row 1 = Q component

    The dataset is assumed to be laid out class-major, then SNR, with
    ``frames_per_snr_class`` consecutive frames per (class, SNR) pair. Each
    frame's one-hot label and SNR are checked against that assumption and a
    warning is printed on mismatch (the frame is still written).

    Relies on the notebook-level names ``file_path``, ``base_dir``,
    ``class_names`` and ``snr_values``.

    Parameters
    ----------
    frames_per_snr_class : int, optional
        Number of consecutive frames per modulation-SNR combination
        (default 4096). One block of this many frames is held in memory
        at a time.

    Raises
    ------
    ValueError
        If the dataset holds fewer frames than
        ``len(class_names) * len(snr_values) * frames_per_snr_class``.
        Checked up front so no files are written for an incompatible layout.
    """

    print("Starting frame extraction...")

    with h5py.File(file_path, 'r') as hdf5_file:
        X = hdf5_file['X']  # I/Q data
        Y = hdf5_file['Y']  # One-hot encoded labels
        Z = hdf5_file['Z']  # SNR values

        total_frames = X.shape[0]

        # Fail fast instead of hitting an out-of-range index after hours of writing
        expected_frames = len(class_names) * len(snr_values) * frames_per_snr_class
        if expected_frames > total_frames:
            raise ValueError(
                f"Dataset has {total_frames} frames but {expected_frames} are required "
                f"({len(class_names)} classes x {len(snr_values)} SNRs x "
                f"{frames_per_snr_class} frames)"
            )

        frame_idx = 0
        processed_frames = 0

        # Iterate through each modulation class
        for class_idx, class_name in enumerate(class_names):
            print(f"Processing class {class_idx + 1}/{len(class_names)}: {class_name}")

            # Iterate through each SNR value
            for snr_value in snr_values:

                # Make sure the target directory exists even if the
                # directory-creation cell was not run first
                dir_path = os.path.join(base_dir, f"snr_{snr_value}", class_name)
                os.makedirs(dir_path, exist_ok=True)

                # Read the whole class-SNR block with one sliced HDF5 read per
                # dataset; per-frame indexing would issue millions of tiny reads.
                block_end = frame_idx + frames_per_snr_class
                block_frames = X[frame_idx:block_end]                        # (n, 1024, 2)
                block_classes = np.argmax(Y[frame_idx:block_end], axis=1)    # (n,)
                block_snrs = Z[frame_idx:block_end][:, 0]                    # (n,)

                for frame_num in range(frames_per_snr_class):
                    # Verify this frame belongs to the expected class and SNR
                    found_class_idx = block_classes[frame_num]
                    found_snr = block_snrs[frame_num]
                    if found_class_idx != class_idx or found_snr != snr_value:
                        print(f"Warning: Frame {frame_idx} has unexpected class {found_class_idx} or SNR {found_snr}")

                    # (1024, 2) -> (2, 1024): row 0 = I, row 1 = Q. Copy to a
                    # C-contiguous array so the saved file is row-major.
                    frame_array = np.ascontiguousarray(block_frames[frame_num].T)

                    # Save as .npy file
                    file_path_save = os.path.join(dir_path, f"frame_{frame_num}.npy")
                    np.save(file_path_save, frame_array)

                    frame_idx += 1
                    processed_frames += 1

                    # Progress update every 10000 frames
                    if processed_frames % 10000 == 0:
                        print(f"Processed {processed_frames}/{total_frames} frames ({processed_frames/total_frames*100:.1f}%)")

                print(f"  Completed SNR {snr_value} dB: {frames_per_snr_class} frames saved")

            print(f"Completed class {class_name}: {len(snr_values) * frames_per_snr_class} frames saved")

        # A real newline here; the original escaped backslash printed "\n" literally
        print(f"\nExtraction complete! Total frames processed: {processed_frames}")
        print(f"Total files created: {processed_frames}")

# Heads-up only: the extraction itself is slow because of the sheer number
# of files (2.5M+ frames), so this cell just announces how to start it.
for notice in (
    "Ready to extract frames as .npy files with 2x1024 format.",
    "Run extract_and_save_frames() to start the extraction process.",
):
    print(notice)

Ready to extract frames as .npy files with 2x1024 format.
Run extract_and_save_frames() to start the extraction process.


In [32]:
# Summarize what the full extraction will produce before launching it.
# Counts and paths come from frames_per_snr / base_dir rather than repeated
# literals, so this summary cannot drift from the notebook's configuration.
print("To run the full extraction, run the line below.")
print("This will create:")
print(f"- {len(class_names)} modulation classes")
print(f"- {len(snr_values)} SNR values per class")
print(f"- {frames_per_snr} frames per class-SNR combination")
print(f"- Total: {len(class_names) * len(snr_values) * frames_per_snr} .npy files")
print()

# Sketch of the directory tree (first 3 SNR values x first 3 classes only)
print("Directory structure will be:")
print(f"{base_dir}/")
for snr in snr_values[:3]:
    print(f"  snr_{snr}/")
    for class_name in class_names[:3]:
        print(f"    {class_name}/")
        print("      frame_0.npy")
        print("      frame_1.npy")
        print("      ...")
        print(f"      frame_{frames_per_snr - 1}.npy")
    print("    ...")
print("  ...")
print()

# Per-file format reminder
print("Each .npy file contains:")
print("- 2x1024 numpy array")
print("- Row 0: I component (1024 samples)")
print("- Row 1: Q component (1024 samples)")
print("- Data type: float32")
print("- No metadata - pure signal data only")

To run the full extraction, run the line below.
This will create:
- 24 modulation classes
- 26 SNR values per class
- 4096 frames per class-SNR combination
- Total: 2555904 .npy files

Directory structure will be:
Dataset/
  snr_-20/
    OOK/
      frame_0.npy
      frame_1.npy
      ...
      frame_4095.npy
    4ASK/
      frame_0.npy
      frame_1.npy
      ...
      frame_4095.npy
    8ASK/
      frame_0.npy
      frame_1.npy
      ...
      frame_4095.npy
    ...
  snr_-18/
    OOK/
      frame_0.npy
      frame_1.npy
      ...
      frame_4095.npy
    4ASK/
      frame_0.npy
      frame_1.npy
      ...
      frame_4095.npy
    8ASK/
      frame_0.npy
      frame_1.npy
      ...
      frame_4095.npy
    ...
  snr_-16/
    OOK/
      frame_0.npy
      frame_1.npy
      ...
      frame_4095.npy
    4ASK/
      frame_0.npy
      frame_1.npy
      ...
      frame_4095.npy
    8ASK/
      frame_0.npy
      frame_1.npy
      ...
      frame_4095.npy
    ...
  ...

Each .npy file contains

In [33]:
# Full extraction - WARNING: This will create 2,555,904 files and may take several hours!
# File names are fixed (frame_N.npy per SNR/class directory), so running this
# cell again writes to the same paths as the previous run.
extract_and_save_frames()

Starting frame extraction...
Processing class 1/24: OOK
  Completed SNR -20 dB: 4096 frames saved
  Completed SNR -20 dB: 4096 frames saved
  Completed SNR -18 dB: 4096 frames saved
  Completed SNR -18 dB: 4096 frames saved
Processed 10000/2555904 frames (0.4%)
Processed 10000/2555904 frames (0.4%)
  Completed SNR -16 dB: 4096 frames saved
  Completed SNR -16 dB: 4096 frames saved
  Completed SNR -14 dB: 4096 frames saved
  Completed SNR -14 dB: 4096 frames saved
Processed 20000/2555904 frames (0.8%)
  Completed SNR -12 dB: 4096 frames saved
Processed 20000/2555904 frames (0.8%)
  Completed SNR -12 dB: 4096 frames saved
  Completed SNR -10 dB: 4096 frames saved
  Completed SNR -10 dB: 4096 frames saved
  Completed SNR -8 dB: 4096 frames saved
  Completed SNR -8 dB: 4096 frames saved
Processed 30000/2555904 frames (1.2%)
Processed 30000/2555904 frames (1.2%)
  Completed SNR -6 dB: 4096 frames saved
  Completed SNR -6 dB: 4096 frames saved
  Completed SNR -4 dB: 4096 frames saved
  Compl