In [1]:
import os
import sys
import numpy as np
import re
import h5py
import json
from collections import defaultdict
import threading
import boto3
import matlab.engine
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment
# (presumably the AWS credentials/region that boto3 needs — TODO confirm).
load_dotenv()

# Module-level S3 client; the S3 listing helpers defined later in this notebook use it.
s3 = boto3.client('s3')

In [2]:
# --- Local paths -----------------------------------------------------------
# NOTE(review): these are absolute, machine-specific paths; adjust before
# running on another machine.

# Directory the MATLAB engine changes into before the frame decoder is called.
MATLAB_OFDM_DECODER = '/Users/stepanmazokha/Desktop/mobintel-rffi/preprocessor/frame_mac_detection'
# Local directory where the .dat captures are expected while being processed.
TEMP_IQ_DIRECTORY = '/Users/stepanmazokha/Desktop/orbit_processor_temp/'
# JSON file with the MAC address of each device.
NODE_MACS = '/Users/stepanmazokha/Desktop/orbit_device_macs.json'

# --- S3 layout -------------------------------------------------------------
S3_BUCKET_NAME = "mobintel-orbit-dataset"
S3_EXPERIMENT_NAME = "orbit_experiment_jul_19"
# NOTE(review): the two prefixes below are not referenced by the code visible in
# this notebook; the session-classification loop compares against string
# literals instead.
S3_EPOCH_PREFIX = "epoch_"
S3_TRAINING_PREFIX = "training_"

# Directory where the generated h5 dataset files are written.
RFFI_DATASET_TARGET_DIR = f'/Users/stepanmazokha/Desktop/{S3_BUCKET_NAME}_h5/'

# Number of preambles kept from each .dat capture; captures with fewer are skipped.
FRAME_COUNT = 400


In [161]:
# Extracts the signal configuration encoded in the name of a dataset file
# - filename: name of the .dat file (without the directory path)
def parse_dat_name(filename):
    """Pull the capture parameters that are encoded in a .dat file name.

    Args:
        filename: Name of the .dat file, e.g.
            ``tx{node_node1-10}_rx{node_node1-1+...+rxSampRate_25e6}.dat``.

    Returns:
        dict with the keys ``node_tx``, ``node_rx`` (strings following the
        ``node_`` marker inside the ``tx{...}`` / ``rx{...}`` groups) and
        ``samp_rate`` (int, parsed from the ``rxSampRate_<digits>e<digits>``
        field). Any field that cannot be found is ``None``.
    """
    def _first_group(pattern):
        # Text captured by the pattern's first group, or None when it is absent.
        found = re.search(pattern, filename)
        return None if found is None else found.group(1)

    rate_text = _first_group(r'rxSampRate_(\d+e\d+)')

    return {
        "node_tx": _first_group(r'tx\{node_(.*?)\}'),
        # The receiver name ends either at the first '+' option or at the closing brace.
        "node_rx": _first_group(r'rx\{node_(.*?)[\+\}]'),
        # The rate is written in scientific notation, so go through float first.
        "samp_rate": None if rate_text is None else int(float(rate_text)),
    }

In [5]:
# Start a MATLAB engine session and keep its handle in `mateng`.
mateng = matlab.engine.start_matlab()

In [4]:
# Shut down the MATLAB engine session held in `mateng`.
# NOTE(review): requires `mateng` to have been created already; run this only after an engine was started.
mateng.quit()

In [162]:
# Reads a JSON file containing MAC addresses of devices
def get_device_macs(file_path):
    """Load the device MAC address table from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, exactly as produced by ``json.load``.
    """
    with open(file_path, 'r') as macs_file:
        return json.load(macs_file)

In [163]:
class ProgressPercentage:
    """Transfer callback that prints a single-line progress indicator to stdout.

    An instance is meant to be passed as the ``Callback`` of a transfer; every
    call adds the reported byte count to a running total and rewrites the
    progress line.

    Args:
        filename: Label printed at the start of the progress line.
        total_size: Expected total size of the transfer in bytes. May be 0
            (empty object), in which case progress is reported as 100%.
    """

    def __init__(self, filename, total_size):
        self._filename = filename
        self._total_size = total_size
        self._seen_so_far = 0
        # The callback may be invoked from several transfer threads; the lock
        # keeps the counter update and the printed line consistent.
        self._lock = threading.Lock()

    def __call__(self, bytes_amount):
        """Record ``bytes_amount`` newly transferred bytes and redraw the progress line."""
        # To simplify, assume this is hooked up to a single filename
        with self._lock:
            self._seen_so_far += bytes_amount
            if self._total_size:
                percentage = (self._seen_so_far / self._total_size) * 100
            else:
                # Zero-byte object: nothing to transfer, so avoid dividing by zero.
                percentage = 100.0
            mb_downloaded = self._seen_so_far / (1024 * 1024)
            mb_total = self._total_size / (1024 * 1024)
            # The leading '\r' returns to the start of the line so it is overwritten in place.
            sys.stdout.write(
                "\r%s  %.2f%% (%.2f MB of %.2f MB)" % (self._filename, percentage, mb_downloaded, mb_total))
            sys.stdout.flush()

def download_file_with_progress(bucket_name, s3_key, local_path):
    """Download one S3 object to a local file while printing its progress.

    Args:
        bucket_name: Name of the S3 bucket.
        s3_key: Key of the object inside the bucket.
        local_path: Destination path on the local file system. Missing parent
            directories are created.
    """
    s3 = boto3.client('s3')

    # Get the total size of the object (needed to report a percentage)
    response = s3.head_object(Bucket=bucket_name, Key=s3_key)
    total_size = response['ContentLength']

    # Ensure the local directory exists. A bare file name has an empty
    # directory part, which os.makedirs would reject, so skip it in that case;
    # exist_ok avoids failing if the directory appears between check and creation.
    local_dir = os.path.dirname(local_path)
    if local_dir:
        os.makedirs(local_dir, exist_ok=True)

    # Start the download and show the progress
    s3.download_file(bucket_name, s3_key, local_path, Callback=ProgressPercentage(local_path, total_size))
    
def s3_list_subdirs(bucket_name, prefix):
    """List the names of the immediate "sub-directories" under an S3 prefix.

    Args:
        bucket_name: Name of the S3 bucket.
        prefix: Key prefix to look under (should end with '/').

    Returns:
        List of sub-directory names (last path component only). Empty when the
        prefix has no sub-directories.
    """
    # Paginate so that prefixes with more entries than fit in one response are
    # listed completely.
    paginator = s3.get_paginator('list_objects_v2')
    pages = paginator.paginate(Bucket=bucket_name, Prefix=prefix, Delimiter='/')

    subdirs = []
    for page in pages:
        # 'CommonPrefixes' is absent from a page that has no sub-directories.
        for entry in page.get('CommonPrefixes', []):
            subdirs.append(os.path.basename(os.path.normpath(entry['Prefix'])))
    return subdirs

def s3_list_files(bucket_name, prefix):
    """Collect the names of all .dat objects stored under an S3 prefix.

    Args:
        bucket_name: Name of the S3 bucket.
        prefix: Key prefix to list.

    Returns:
        List of file names (last path component of each key) ending in '.dat'.
    """
    # Walk every result page so long listings are not truncated.
    pages = s3.get_paginator('list_objects_v2').paginate(Bucket=bucket_name, Prefix=prefix)

    dat_names = []
    for page in pages:
        # A page without matching objects carries no 'Contents' entry.
        for obj in page.get('Contents', []):
            name = os.path.basename(os.path.normpath(obj['Key']))
            if name.endswith('.dat'):
                dat_names.append(name)

    return dat_names


In [164]:
# Create a dictionary that maps the grid position "X-Y" of every sensor physically present
# in the Orbit testbed facility (a 20x20 grid) to a numeric ID (0--399). This is later used
# to produce unique labels for the sensor fingerprinting model.
def generate_node_ids():
    """Build the mapping from node grid position to numeric label.

    Returns:
        dict mapping "<row>-<col>" (both running from 1 to 20) to consecutive
        integers starting at 0, assigned row by row: "1-1" -> 0, "1-2" -> 1,
        ..., "20-20" -> 399.
    """
    grid = range(1, 21)
    positions = (f"{row}-{col}" for row in grid for col in grid)
    return {position: index for index, position in enumerate(positions)}

# Save an h5 dataset file containing labels & data for a given set of devices
def save_dataset_h5(file_target, label, data):
    """Write labels and data into a new h5 file.

    Args:
        file_target: Path of the h5 file to create (opened in 'w' mode).
        label: Values stored in the 'label' dataset.
        data: Values stored in the 'data' dataset.

    Both datasets are stored as float64.
    """
    print('Saving', file_target)
    with h5py.File(file_target, 'w') as out_file:
        for dataset_name, values in (('label', label), ('data', data)):
            out_file.create_dataset(dataset_name, data=values, dtype='float64')

# Package & store epoch information in an h5 file (ready for RFFI)
def epoch_save(node_ids_dict, target_dir, epoch_preambles, session_label=None):
    """Package the preambles collected per receiver into h5 files (ready for RFFI).

    One file named ``node<rx_name>_<session_label>.h5`` is written per receiver.
    Each row of its 'data' array is one preamble with real and imaginary parts
    interleaved (real at even columns, imaginary at odd columns); the matching
    row of 'label' holds the numeric ID of the transmitting node.

    Args:
        node_ids_dict: Mapping from transmitter node name to numeric label.
        target_dir: Directory the h5 files are written to.
        epoch_preambles: Mapping from receiver name to a list of dicts, each
            with a 2-D complex array under 'preambles' (frames x samples) and
            the transmitter name under 'node_tx'.
        session_label: Suffix used in the output file name. When omitted, the
            notebook-level ``session_type`` variable is used, as before.

    Raises:
        ValueError: If the preamble arrays stored for one receiver are not all
            2-D with the same number of samples per frame.
    """
    if session_label is None:
        # Backward compatibility: earlier callers relied on the notebook global.
        session_label = session_type

    for rx_name, rx_epochs in epoch_preambles.items():
        if not rx_epochs:
            # Nothing was captured for this receiver; there is nothing to size or save.
            print(f"No preambles stored for receiver {rx_name}, skipping")
            continue

        blocks = [np.asarray(rx_epoch['preambles']) for rx_epoch in rx_epochs]
        if any(block.ndim != 2 for block in blocks):
            raise ValueError(f"Receiver {rx_name}: preambles must be 2-D (frames x samples)")

        # Take the sizes from the stored arrays themselves rather than from
        # notebook globals, which only reflect the most recently processed file.
        sample_len = blocks[0].shape[1]
        if any(block.shape[1] != sample_len for block in blocks):
            raise ValueError(f"Receiver {rx_name}: captures have different preamble lengths")
        total_frames = sum(block.shape[0] for block in blocks)

        # Data shape: (total frames, samples x 2)
        # All frames/samples from all emitters are stitched together
        h5_data = np.zeros((total_frames, sample_len * 2), dtype='float64')
        # Labels shape: (total frames, 1)
        h5_labels = np.zeros((total_frames, 1), dtype='float64')

        row = 0
        for rx_epoch, block in zip(rx_epochs, blocks):
            stop = row + block.shape[0]
            h5_data[row:stop, 0::2] = np.real(block)
            h5_data[row:stop, 1::2] = np.imag(block)
            h5_labels[row:stop] = node_ids_dict[rx_epoch['node_tx']]
            row = stop

        dataset_filepath = os.path.join(target_dir, f'node{rx_name}_{session_label}.h5')
        save_dataset_h5(dataset_filepath, h5_labels, h5_data)

In [194]:
# Main processing cell: for one recording session, turn the .dat captures listed
# in S3 into per-receiver h5 datasets.

# 1. Check if a directory to store the final dataset exists and create it if not
if not os.path.exists(RFFI_DATASET_TARGET_DIR):
    os.makedirs(RFFI_DATASET_TARGET_DIR)

# 2. Set up the MATLAB environment before starting
# NOTE(review): a new engine is started every time this cell runs and nothing in
# this cell shuts it down — confirm engines are not piling up between runs.
mateng = matlab.engine.start_matlab()
mateng.cd(MATLAB_OFDM_DECODER, nargout=0)

# 3. Load a JSON file with device MAC addresses
device_macs = get_device_macs(NODE_MACS)

# 4. Generate a dictionary of node IDs
node_ids = generate_node_ids()

# 5. Obtain a list of sessions in the experiment
sessions = s3_list_subdirs(S3_BUCKET_NAME, S3_EXPERIMENT_NAME + '/')

# Sessions named 'epoch_*' are collected as testing sessions, 'training_*' as
# training sessions; anything else is reported and ignored.
training_sessions = []
testing_sessions = []
for session_name in sessions:
    if session_name[0:6] == 'epoch_':
        testing_sessions.append(session_name)
    elif session_name[0:9] == 'training_':
        training_sessions.append(session_name)
    else: print("Skipping session", session_name)

# 6. Process each training epoch
# NOTE(review): the loop over training_sessions is still disabled; a single
# hard-coded session is processed instead.
# TODO: for session_name in training_sessions:
session_name = "training_2024-07-20_00-50-38"
# Used as the suffix of the output file names (epoch_save picks it up as a notebook-level variable).
session_type = "train"

print("Processing session ", session_name)
session_dat_files = s3_list_files(S3_BUCKET_NAME, S3_EXPERIMENT_NAME + "/" + session_name + "/")

# 6.1. Prepare a dictionary (receiver name -> list of captures) to store preambles for this epoch
epoch_preambles = defaultdict(list)
rx_nodes = set()

# 7. Process each .dat file
for dat_file in session_dat_files:
    print(f"- {dat_file}")

    # 7.1. Download the file from S3
    # NOTE(review): the download call is commented out, so local_filepath
    # presumably has to exist already from an earlier run — confirm.
    s3_filepath = f"{S3_EXPERIMENT_NAME}/{session_name}/{dat_file}"
    local_filepath = os.path.join(TEMP_IQ_DIRECTORY, dat_file)
    print(f'Downloading {dat_file}...')
    # TODO: download_file_with_progress(S3_BUCKET_NAME, s3_filepath, local_filepath)

    # 7.2. Extract signal info from its name
    dat_config = parse_dat_name(dat_file)
    # Parsed names look like 'node1-10'; [4:] drops the leading 'node' to get '1-10'.
    # NOTE(review): this raises TypeError if the file name did not match and the value is None.
    tx_name = dat_config['node_tx'][4:]
    rx_name = dat_config['node_rx'][4:]
    samp_rate = dat_config['samp_rate']
    preamble_len = 400 if samp_rate == 25e6 else 320 # samp rate can be 25e6 or 20e6

    rx_nodes.add(rx_name)

    # 7.3. Retrieve node MAC address
    tx_mac = device_macs[tx_name]['mac']

    # 7.4. Decode the file via MATLAB script, extract preambles
    response = mateng.find_tx_frames(local_filepath, 'CBW20', samp_rate, tx_mac, preamble_len)
    # preamble_bounds = np.array(response['preamble_bounds']).squeeze()
    # Presumably one row per detected frame and one column per preamble sample — TODO confirm.
    preamble_iq = np.array(response['preamble_iq']).squeeze()

    # Captures with fewer than FRAME_COUNT frames are dropped entirely.
    if preamble_iq.shape[0] < FRAME_COUNT:
        print(f"Insufficient frames captured: {dat_file}")
        continue

    # 7.5. Store information from the current dat file (only the first FRAME_COUNT frames are kept)
    epoch_preambles[rx_name].append({
        'preambles': preamble_iq[0:FRAME_COUNT, :],
        'node_tx': tx_name,
        'node_rx': rx_name,
        'node_mac': tx_mac,
        'preamble_len': preamble_len
    })

    # 7.6. Remove the local file after the processing is completed
    # TODO: print(f"Deleting local file {local_filepath}")
    # TODO: os.remove(local_filepath)

    # NOTE(review): stops after the first successfully processed file — looks
    # like a debugging shortcut; remove to process the whole session.
    break

# 8. Write one h5 dataset per receiver
epoch_save(node_ids, RFFI_DATASET_TARGET_DIR, epoch_preambles)

Skipping session silent
Skipping session silent_2024-07-21_14-19-00
Processing session  training_2024-07-20_00-50-38
- tx{node_node1-10}_rx{node_node1-1+rxFreq_2462e6+rxGain_10+capLen_2+rxSampRate_25e6}.dat
Downloading tx{node_node1-10}_rx{node_node1-1+rxFreq_2462e6+rxGain_10+capLen_2+rxSampRate_25e6}.dat...
Resampling input waveform from 25 MHz to 20 MHz
........................................
........................................
........................................
........................................
........................................
........................................
........................................
........................................
........................................
........................................
........................................
........................................
........................................
........................................
........................................
...........................

Saving /Users/stepanmazokha/Desktop/mobintel-orbit-dataset_h5/node1-1_train.h5


In [205]:
def read_dataset_h5(file_target, dataset_name):
    """Read one dataset from an h5 file fully into memory.

    Args:
        file_target: Path of the h5 file to open (read-only).
        dataset_name: Name of the dataset inside the file.

    Returns:
        The complete content of the dataset, read with ``[:]``.
    """
    with h5py.File(file_target, 'r') as source:
        # The slice copies the values out before the file is closed.
        return source[dataset_name][:]

# Sanity check: read back the 'data' array of the file written for receiver 1-1.
a = read_dataset_h5('/Users/stepanmazokha/Desktop/mobintel-orbit-dataset_h5/node1-1_train.h5', 'data')

# Rows are frames; columns are interleaved real/imaginary samples (2 per preamble sample).
a.shape

(400, 800)