In [22]:
import numpy as np
import scipy.io
import os
import json
import xmltodict
import h5py
import requests

In [23]:
# --- Dataset shape parameters -------------------------------------------------
MIN_FRAMES = 500    # passed as `frame_count` to process_save_rx below
FRAMES_TRAIN = 500  # not referenced in the cells visible here
FRAMES_TEST = 100   # not referenced in the cells visible here
PKT_LEN = 400       # passed as `sample_count` (complex IQ samples kept per frame)

# --- Locations ----------------------------------------------------------------
# The dataset root defaults to the original author's local copy, but can be
# redirected without editing the notebook by exporting WISIG_ROOT_DIR.
ROOT_DIR = os.environ.get('WISIG_ROOT_DIR', '/Users/stepanmazokha/Desktop/wisig_frames_rffi_dataset')
NODE_DIR = '/node1-1_wifi_2021_03_08'

# Directory holding the per-node packet files that get packaged into H5.
DIR_SOURCE = ROOT_DIR + NODE_DIR + '/equalized_packets_min500frames/'

# Output H5 files: non-equalized / equalized IQ data, for the train and test node sets.
FILE_TARGET_NON_EQ_TRAIN = ROOT_DIR + NODE_DIR + '/node1-1_non_eq_train.h5'
FILE_TARGET_EQ_TRAIN = ROOT_DIR + NODE_DIR + '/node1-1_eq_train.h5'
FILE_TARGET_NON_EQ_TEST = ROOT_DIR + NODE_DIR + '/node1-1_non_eq_test.h5'
FILE_TARGET_EQ_TEST = ROOT_DIR + NODE_DIR + '/node1-1_eq_test.h5'

# Cached ORBIT device inventory (written by get_orbit_node_infos, read back later).
ORBIT_DEVICE_INFO = ROOT_DIR + '/orbit_device_info.json'

In [25]:
def get_dataset_nodes(dir_source):
    """Return the node names that have a packet file ready for H5 packaging.

    A packet file is named ``packets_<node>.mat``; the node name is whatever
    sits between that prefix and suffix. Entries that do not follow the
    pattern (hidden files such as ``.DS_Store``, sub-directories, ...) are
    ignored instead of being sliced into meaningless or empty names.

    Args:
        dir_source: directory to scan.

    Returns:
        List of node-name strings, in the order reported by ``os.listdir``.
    """
    prefix, suffix = 'packets_', '.mat'
    return [
        fname[len(prefix):-len(suffix)]
        for fname in os.listdir(dir_source)
        # The length check rejects a bare "packets_.mat", which has no node name.
        if fname.startswith(prefix)
        and fname.endswith(suffix)
        and len(fname) > len(prefix) + len(suffix)
    ]

def get_orbit_node_capabilities(node_id, show = False):
    """Fetch the capability/device description of one ORBIT grid node.

    Credentials are read from the environment rather than being embedded in
    the notebook:

    * ``ORBIT_USERNAME`` / ``ORBIT_PASSWORD`` (required) - sent as HTTP Basic auth.
    * ``ORBIT_COOKIE`` (optional) - raw ``Cookie`` header value, if a session
      cookie is needed as well.

    NOTE(review): the credentials and session cookie that used to be embedded
    in this function were stored in plain text and should be rotated.

    Args:
        node_id: grid coordinate of the node, e.g. ``"1-1"``.
        show: when True, pretty-print the parsed response.

    Returns:
        The XML response parsed into a dict by ``xmltodict``, or ``None`` when
        the server answers with a status code other than 200.

    Raises:
        RuntimeError: if the required credential variables are not set.
    """
    username = os.environ.get("ORBIT_USERNAME")
    password = os.environ.get("ORBIT_PASSWORD")
    if not username or not password:
        raise RuntimeError(
            "Set the ORBIT_USERNAME and ORBIT_PASSWORD environment variables "
            "before querying the ORBIT status service."
        )

    url = f"https://www.orbit-lab.org/cPanel/status/getNodeCapabilities?node=node{node_id}.grid.orbit-lab.org"
    # Browser-like headers carried over from the original request. Accept-Encoding
    # is deliberately left out so that `requests` only advertises encodings it can
    # decode itself (brotli/zstd need optional packages).
    headers = {
        "Accept": "*/*",
        "Accept-Language": "en-US,en;q=0.9,uk-UA;q=0.8,uk;q=0.7,ru;q=0.6",
        "Connection": "keep-alive",
        "Host": "www.orbit-lab.org",
        "Referer": "https://www.orbit-lab.org/cPanel/status/template/index.html",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
        "sec-ch-ua": "\"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"126\", \"Google Chrome\";v=\"126\"",
        "sec-ch-ua-mobile": "?0"
    }
    session_cookie = os.environ.get("ORBIT_COOKIE")
    if session_cookie:
        headers["Cookie"] = session_cookie

    # A timeout keeps one unresponsive request from hanging the notebook forever.
    response = requests.get(url, headers=headers, auth=(username, password), timeout=30)

    if response.status_code != 200:
        return None

    response_dict = xmltodict.parse(response.text)
    if show:
        print(json.dumps(response_dict, indent=4))
    return response_dict

def save_dict_to_json_file(dictionary, file_path):
    """Write `dictionary` to `file_path` as JSON indented by 4 spaces.

    The target file is created, or truncated if it already exists.
    """
    with open(file_path, 'w') as out_stream:
        json.dump(dictionary, out_stream, indent=4)

def read_json_file_to_dict(file_path):
    """Parse the JSON document stored at `file_path` and return the result."""
    with open(file_path, 'r') as in_stream:
        return json.load(in_stream)

def contains_allowed_substring(input_string, allowed_substrings):
    """Return True if at least one of `allowed_substrings` occurs in `input_string`.

    An empty `allowed_substrings` yields False.
    """
    return any(candidate in input_string for candidate in allowed_substrings)

def get_orbit_node_infos(node_list, file_path):
    """Query the device inventory of every node and cache the result as JSON.

    For each node the ``response -> action -> devices -> device`` part of the
    capability response is stored under the node id. Nodes for which the
    request fails, or whose response lacks that structure, are reported and
    skipped so that a single bad answer does not discard the results already
    collected.

    Args:
        node_list: iterable of node ids (e.g. ``"1-1"``).
        file_path: destination JSON file for the ``{node_id: devices}`` mapping.
    """
    node_infos = {}

    for node_id in node_list:
        print("Processing", node_id)
        node_info = get_orbit_node_capabilities(node_id)

        if node_info is None:
            print(node_id, ': nothing found')
            continue

        try:
            node_infos[node_id] = node_info['response']['action']['devices']['device']
        except (KeyError, TypeError):
            # Missing key, or an intermediate element parsed as None/str.
            print(node_id, ': unexpected response structure')

    save_dict_to_json_file(node_infos, file_path)

def filter_nodes_by_device_model(dir_node_list, node_infos, device_types_allowed=('5212',)):
    """Select the nodes that carry at least one WiFi card of an allowed model.

    Background (from the original notes): the paper mentions Atheros 5212,
    9220, 9280 and 9580 WiFi cards. We want the largest set of nodes with
    sufficient data that all share ONE of these cards, since a single hardware
    vendor/model is expected to give better model performance. Experimentation
    showed the 5212 card to be the most common (47 devices with the 500-frame
    limit). The vendor/model can also be identified through the
    ``@INV_dev_id`` field.

    Args:
        dir_node_list: node names for which packet data exists on disk.
        node_infos: mapping ``node_id -> device description(s)`` as cached by
            ``get_orbit_node_infos``. A value may be a list of device dicts or
            a single device dict (xmltodict emits a bare dict when an element
            occurs only once).
        device_types_allowed: substrings to look for in ``@INV_dev_type``.

    Returns:
        Node ids, in ``node_infos`` order, that are present in `dir_node_list`
        and have at least one matching device.
    """
    available_nodes = set(dir_node_list)
    allowed_label = '/'.join(device_types_allowed)

    node_list_filtered = []
    for node_id, node_info in node_infos.items():
        if node_id not in available_nodes:
            print('Such node is not present in the directory.')
            continue

        # Normalise to a list so that single-device nodes are not iterated key by key.
        devices = [node_info] if isinstance(node_info, dict) else (node_info or [])

        node_fit_devices = 0
        for device in devices:
            device_type = device.get("@INV_dev_type")
            # Devices without a type attribute cannot match; skip them rather than
            # letting the substring test fail on None.
            if device_type and contains_allowed_substring(device_type, device_types_allowed):
                node_fit_devices += 1

        if node_fit_devices == 0:
            print(node_id, ':', allowed_label, 'NOTHING FOUND')
        else:
            node_list_filtered.append(node_id)

    print('Nodes with Atheros', allowed_label, 'WiFi card found:', len(node_list_filtered))

    return node_list_filtered

# Node names derived from the file names found in DIR_SOURCE.
dir_node_list = get_dataset_nodes(DIR_SOURCE)

# One-off refresh of the cached ORBIT device inventory (performs HTTP requests);
# left disabled so that the cached JSON file is used instead.
# get_orbit_node_infos(dir_node_list, ORBIT_DEVICE_INFO)
node_infos = read_json_file_to_dict(ORBIT_DEVICE_INFO)

# Keep only the nodes that have data on disk and a matching WiFi card.
node_list_filtered = filter_nodes_by_device_model(
    dir_node_list=dir_node_list,
    node_infos=node_infos,
)

Such node is not present in the directory.
Such node is not present in the directory.
Such node is not present in the directory.
Such node is not present in the directory.
Such node is not present in the directory.
Such node is not present in the directory.
20-7 : 5212 NOTHING FOUND
Such node is not present in the directory.
Nodes with Atheros 5212 WiFi card found: 40


In [26]:
# Number of nodes that passed the device-model filter; process_save_rx splits
# this list into a training part and a test part.
len(node_list_filtered)

40

In [27]:
def save_dataset_h5(file_target, label, data):
    """Write one labelled dataset to an HDF5 file.

    The file at `file_target` is opened in 'w' mode and receives two float64
    datasets: 'label' (from `label`) and 'data' (from `data`).
    """
    print('Saving', file_target)
    with h5py.File(file_target, 'w') as out_file:
        for dataset_name, values in (('label', label), ('data', data)):
            out_file.create_dataset(dataset_name, data=values, dtype='float64')

def package_dataset_h5(node_names, dir_source, frame_count, sample_count):
    """Load per-node packet files and flatten them into training matrices.

    For every node, ``packets_<node_name>`` is loaded from `dir_source` with
    ``scipy.io.loadmat``. Its ``packet_log`` entry is read as a list of frames;
    column 0 of a frame holds the non-equalized IQ samples and column 1 the
    equalized ones. The first `frame_count` frames of each node are kept, each
    truncated to `sample_count` complex samples and stored as one row of
    interleaved real/imaginary values (I0, Q0, I1, Q1, ...).

    Nodes with fewer than `frame_count` frames are skipped. Their rows are
    dropped from the result, so the returned arrays never contain all-zero
    padding rows that would otherwise carry label 0.

    Args:
        node_names: node names; a node's position in this list is its label.
        dir_source: directory containing the ``packets_<node_name>`` files.
        frame_count: number of frames to take from each node.
        sample_count: number of complex IQ samples to take from each frame.

    Returns:
        ``[non_eq, eq, labels]`` where ``non_eq`` and ``eq`` are float64 arrays
        of shape ``(rows, 2 * sample_count)`` and ``labels`` is a float64 array
        of shape ``(rows, 1)``; ``rows`` is `frame_count` times the number of
        nodes that were not skipped.
    """
    total_rows = len(node_names) * frame_count

    h5data_non_eq = np.zeros((total_rows, sample_count * 2), dtype='float64')
    h5data_eq = np.zeros((total_rows, sample_count * 2), dtype='float64')
    h5labels = np.zeros((total_rows, 1), dtype='float64')

    h5_idx = 0
    for node_idx, node_name in enumerate(node_names):
        print("Processing", node_name)

        f = scipy.io.loadmat(os.path.join(dir_source, 'packets_' + node_name), verify_compressed_data_integrity=False)

        # Retrieve the list of frames; each item is a cell, containing two vectors: non-eq & eq IQ samples
        frames = f['packet_log'][0]

        if len(frames) < frame_count:
            print('Not enough frames for ', node_name)
            continue

        for frame in frames[:frame_count]:
            # Truncate along the sample axis with sample_count (not frame_count).
            iq_non_eq = frame[0:sample_count, 0]  # non-equalized
            iq_eq = frame[0:sample_count, 1]  # equalized

            h5data_non_eq[h5_idx, 0::2] = np.real(iq_non_eq)
            h5data_non_eq[h5_idx, 1::2] = np.imag(iq_non_eq)

            h5data_eq[h5_idx, 0::2] = np.real(iq_eq)
            h5data_eq[h5_idx, 1::2] = np.imag(iq_eq)

            h5labels[h5_idx] = node_idx

            h5_idx += 1

    # Drop the rows that were reserved for skipped nodes but never filled.
    return [h5data_non_eq[:h5_idx], h5data_eq[:h5_idx], h5labels[:h5_idx]]

def process_save_rx(node_list_filtered, dir_source, frame_count, sample_count, file_target_non_eq_train, file_target_eq_train, file_target_non_eq_class, file_target_eq_class, train_node_count=30):
    """Split the nodes into train/test sets, package each set and save it as H5.

    The first `train_node_count` nodes form the training set and all remaining
    nodes form the test ("class") set, so the two sets contain disjoint
    devices. Labels are assigned independently within each set by
    ``package_dataset_h5``.

    Args:
        node_list_filtered: ordered list of node names to package.
        dir_source: directory containing the per-node packet files.
        frame_count: frames to take per node.
        sample_count: complex IQ samples to take per frame.
        file_target_non_eq_train: output path, non-equalized training data.
        file_target_eq_train: output path, equalized training data.
        file_target_non_eq_class: output path, non-equalized test data.
        file_target_eq_class: output path, equalized test data.
        train_node_count: number of leading nodes used for training
            (default 30, the previously hard-coded split).
    """
    nodes_train = node_list_filtered[:train_node_count]  # leading nodes train the model
    nodes_class = node_list_filtered[train_node_count:]  # all remaining nodes test it

    [h5data_non_eq_train, h5data_eq_train, h5labels_train] = package_dataset_h5(nodes_train, dir_source, frame_count, sample_count)
    [h5data_non_eq_class, h5data_eq_class, h5labels_class] = package_dataset_h5(nodes_class, dir_source, frame_count, sample_count)

    # Save to H5
    save_dataset_h5(file_target=file_target_non_eq_train, label=h5labels_train, data=h5data_non_eq_train)
    save_dataset_h5(file_target=file_target_eq_train, label=h5labels_train, data=h5data_eq_train)

    save_dataset_h5(file_target=file_target_non_eq_class, label=h5labels_class, data=h5data_non_eq_class)
    save_dataset_h5(file_target=file_target_eq_class, label=h5labels_class, data=h5data_eq_class)

# Package the filtered nodes and write the four train/test H5 files.
process_save_rx(
    node_list_filtered=node_list_filtered,
    dir_source=DIR_SOURCE,
    frame_count=MIN_FRAMES,
    sample_count=PKT_LEN,
    file_target_non_eq_train=FILE_TARGET_NON_EQ_TRAIN,
    file_target_eq_train=FILE_TARGET_EQ_TRAIN,
    file_target_non_eq_class=FILE_TARGET_NON_EQ_TEST,
    file_target_eq_class=FILE_TARGET_EQ_TEST,
)

Processing 19-19
Processing 14-7
Processing 10-17
Processing 16-1
Processing 10-11
Processing 13-3
Processing 8-3
Processing 16-16
Processing 1-18
Processing 20-15
Processing 14-10
Processing 11-7
Processing 6-15
Processing 11-4
Processing 4-10
Processing 3-18
Processing 15-1
Processing 20-12
Processing 8-20
Processing 1-10
Processing 19-1
Processing 11-17
Processing 8-8
Processing 1-12
Processing 1-16
Processing 4-1
Processing 20-19
Processing 3-13
Processing 2-6
Processing 20-1
Processing 8-18
Processing 5-5
Processing 7-11
Processing 7-10
Processing 7-14
Processing 2-19
Processing 5-1
Processing 17-10
Processing 12-20
Processing 17-11
Saving /Users/stepanmazokha/Desktop/wisig_frames_rffi_dataset/node1-1_wifi_2021_03_08/node1-1_non_eq_train.h5
Saving /Users/stepanmazokha/Desktop/wisig_frames_rffi_dataset/node1-1_wifi_2021_03_08/node1-1_eq_train.h5
Saving /Users/stepanmazokha/Desktop/wisig_frames_rffi_dataset/node1-1_wifi_2021_03_08/node1-1_non_eq_test.h5
Saving /Users/stepanmazokha/D

In [186]:
def read_dataset_h5(file_target, dataset_name):
    """Read the dataset called `dataset_name` from the HDF5 file `file_target`.

    The contents are read with a full slice while the file is still open, and
    the file is closed before the function returns.
    """
    with h5py.File(file_target, 'r') as in_file:
        return in_file[dataset_name][:]

# Sanity check on the saved equalized test set: package_dataset_h5 allocates
# (number of nodes * frame_count) rows and (2 * sample_count) columns.
# NOTE(review): the execution count of this cell is out of sequence, so the
# displayed shape may come from an earlier packaging run - re-run to confirm.
read_dataset_h5(FILE_TARGET_EQ_TEST, 'data').shape

(8500, 800)

In [5]:
def image_size_calc(L, N):
    """Estimate the number of STFT windows (image width) for 50% overlap.

    With 50% overlap the signal length satisfies L = N + (N/2) * (M - 1), where
      - L: number of samples in the preamble (Shen had 8192)
      - N: nfft (Shen had it as 256)
      - M: number of windows
    Solving for M gives M = 1 + (2/N) * (L - N); the result is truncated to int.
    """
    windows_per_sample = 2 / N
    samples_beyond_first_window = L - N
    window_count = 1 + windows_per_sample * samples_beyond_first_window
    return int(window_count)

# Window count for the values noted for Shen: L = 8192 samples, nfft N = 256.
image_size_calc(8192, 256)

63

In [9]:
# Alternative window-count expression, floor((L - N) / 128 + 1) - 1, for the same
# L = 8192 and N = 256 (128 is presumably the hop size N/2 - TODO confirm).
# It evaluates to one less than image_size_calc(8192, 256).
int(np.floor((8192-256)/128 + 1) - 1)

62