### Preparing Training Dataset

The main purpouse of this notebook is to generate the pickle file containing training dataset we use as an input in our main training script!

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import seaborn as sns

import pyarrow as pa
import pyarrow.feather as feather
import pyarrow.parquet as pq
#import fastparquet as fp
import tensorflow as tf
# Set the URL to data
data_path = './ITU_dataset/'             ## Path to where data is stored
output_train_sim = data_path

The training files (output file) include the information of each simulation as follows:

1. Header line, indicating the name of the input file used for the simulation (the name of the input file contains the OBSS/PD-based threshold used in each case)

2. Array with the throughput (in Mbps) obtained by each STA of the BSS of interest

3. Array with the interference (in dBm) sensed by the AP of interest, from all the other APs.

4. Array with the RSSI (in dBm) received by each STA of the BSS of interest, from its corresponding AP.

5. Array with the average SINR (in dB) experienced by each STA of the BSS of interest.


In [2]:
# First we can check how the output files looks depending on the scenario

output_file_root_name = 'output_11ax_sr_simulations'
column_names = ["input_file_name", "throughput", "interference", "rssi", "sinr"]


# First we put the data from one experiment into pandas dataframe
output_data_scene1_raw = pd.read_csv(data_path + output_file_root_name + '_sce1.txt', sep="\n", names=["raw_data"])
output_data_scene1 = pd.DataFrame()
for x in range(len(column_names)):
    output_data_scene1[column_names[x]] = output_data_scene1_raw[(output_data_scene1_raw.index + x) % 5== 0].reset_index(drop=True)

# Clean the dataset and convert values to float
output_data_scene1["throughput"] = output_data_scene1["throughput"].str.replace('f','')

output_data_scene1["throughput"] = pd.to_numeric(output_data_scene1["throughput"], downcast="float")
output_data_scene1["interference"] = pd.to_numeric(output_data_scene1["interference"], downcast="float")
output_data_scene1["sinr"] = pd.to_numeric(output_data_scene1["sinr"], downcast="float")

for y in range(len(output_data_scene1['rssi'])):
    output_data_scene1.at[y, 'rssi'] = [float(x) for x in output_data_scene1.at[y, 'rssi'].split(',')]

# First we put the data from second experiment into pandas dataframe
output_data_scene2_raw = pd.read_csv(data_path + output_file_root_name + '_sce2.txt', sep="\n", names=["raw_data"], encoding = "latin-1")
output_data_scene2 = pd.DataFrame()
for x in range(len(column_names)):
    output_data_scene2[column_names[x]] = output_data_scene2_raw[(output_data_scene2_raw.index + x) % 5== 0].reset_index(drop=True)

# Convert values to float
# output_data_scene2["throughput"] = output_data_scene2["throughput"].str.replace('«¤Ð\x16d\x9c\x0235.05','0')

list_of_weird_datapoints = []
for y in range(len(output_data_scene2['throughput'])):
    try:
        output_data_scene2.at[y, 'throughput'] = [float(x) for x in output_data_scene2.at[y, 'throughput'].split(',')]
        output_data_scene2.at[y, 'interference'] = [float(x) for x in output_data_scene2.at[y, 'interference'].split(',')]
        output_data_scene2.at[y, 'rssi'] = [float(x) for x in output_data_scene2.at[y, 'rssi'].split(',')]
        output_data_scene2.at[y, 'sinr'] = [float(x) for x in output_data_scene2.at[y, 'sinr'].split(',')]
    except:
        print("problematic index is:", y)
        list_of_weird_datapoints.append(y)

# Drop them:
output_data_scene2 = output_data_scene2.drop(list_of_weird_datapoints)
output_data_scene2 = output_data_scene2.reset_index(drop=True)
        
#output_data_scene2["interference"] = pd.to_numeric(output_data_scene2["interference"], downcast="float")
#output_data_scene2["sinr"] = pd.to_numeric(output_data_scene2["sinr"], downcast="float")

output_data_scene1
output_data_scene2



# First we put the data from second experiment into pandas dataframe
output_data_scene3_raw = pd.read_csv(data_path + output_file_root_name + '_sce3.txt', sep="\n", names=["raw_data"], encoding = "latin-1")
output_data_scene3 = pd.DataFrame()
for x in range(len(column_names)):
    output_data_scene3[column_names[x]] = output_data_scene3_raw[(output_data_scene3_raw.index + x) % 5== 0].reset_index(drop=True)

# Convert values to float
#output_data_scene3["throughput"] = output_data_scene2["throughput"].str.replace('«¤Ð\x16d\x9c\x0235.05','0')

list_of_weird_datapoints = []
for y in range(len(output_data_scene3['throughput'])):
    try:
        output_data_scene3.at[y, 'throughput'] = [float(x) for x in output_data_scene3.at[y, 'throughput'].split(',')]
        output_data_scene3.at[y, 'interference'] = [float(x) for x in output_data_scene3.at[y, 'interference'].split(',')]
        output_data_scene3.at[y, 'rssi'] = [float(x) for x in output_data_scene3.at[y, 'rssi'].split(',')]
        output_data_scene3.at[y, 'sinr'] = [float(x) for x in output_data_scene3.at[y, 'sinr'].split(',')]
    except:
        print("problematic index is:", y)
        list_of_weird_datapoints.append(y)

# Drop them:
#output_data_scene2 = output_data_scene2.drop(list_of_weird_datapoints)
#output_data_scene2 = output_data_scene2.reset_index(drop=True)
        
#output_data_scene2["interference"] = pd.to_numeric(output_data_scene2["interference"], downcast="float")
#output_data_scene2["sinr"] = pd.to_numeric(output_data_scene2["sinr"], downcast="float")

output_data_scene1
output_data_scene2
output_data_scene3

Unnamed: 0,input_file_name,throughput,interference,rssi,sinr
0,KOMONDOR SIMULATION 'sim_input_nodes_s000_v00...,"[34.24, 32.23, 32.68, 35.94]","[-57.27, -58.59, -52.75, -54.78]","[-65.0, -59.98, -91.26, -44.8]","[5.79, 7.02, 7.67, 6.2]"
1,KOMONDOR SIMULATION 'sim_input_nodes_s000_v00...,"[34.26, 32.25, 32.69, 35.87]","[-57.27, -58.59, -52.75, -54.78]","[-65.0, -59.98, -91.26, -44.8]","[5.65, 7.57, 8.01, 5.95]"
2,KOMONDOR SIMULATION 'sim_input_nodes_s000_v00...,"[34.27, 32.25, 32.69, 35.89]","[-57.27, -58.59, -52.75, -54.78]","[-65.0, -59.98, -91.26, -44.8]","[5.79, 7.43, 8.01, 6.0]"
3,KOMONDOR SIMULATION 'sim_input_nodes_s000_v00...,"[34.23, 32.21, 32.63, 35.98]","[-57.27, -58.59, -52.75, -54.78]","[-65.0, -59.98, -91.26, -44.8]","[5.17, 6.98, 8.52, 6.65]"
4,KOMONDOR SIMULATION 'sim_input_nodes_s000_v00...,"[34.36, 32.24, 32.59, 35.94]","[-57.27, -58.59, -52.75, -54.78]","[-65.0, -59.98, -91.26, -44.8]","[6.2, 6.66, 7.84, 6.0]"
...,...,...,...,...,...
188575,KOMONDOR SIMULATION 'sim_input_nodes_s999_v11...,"[35.52, 47.34]","[-52.33, -40.88]","[-93.56, -97.47, -85.51, -79.08]","[47.55, 55.97]"
188576,KOMONDOR SIMULATION 'sim_input_nodes_s999_v11...,"[35.55, 47.28]","[-52.33, -40.88]","[-93.56, -97.47, -85.51, -79.08]","[50.77, 52.47]"
188577,KOMONDOR SIMULATION 'sim_input_nodes_s999_v11...,"[35.57, 47.27]","[-52.33, -40.88]","[-93.56, -97.47, -85.51, -79.08]","[49.52, 53.42]"
188578,KOMONDOR SIMULATION 'sim_input_nodes_s999_v11...,"[35.66, 47.34]","[-52.33, -40.88]","[-93.56, -97.47, -85.51, -79.08]","[48.22, 54.06]"


In [3]:
output_data_scene1

Unnamed: 0,input_file_name,throughput,interference,rssi,sinr
0,KOMONDOR SIMULATION 'sim_input_nodes_s0000_c-...,33.380001,-78.989998,"[-70.51, -77.78, -70.43, -119.21]",1.480000
1,KOMONDOR SIMULATION 'sim_input_nodes_s0000_c-...,33.380001,-77.989998,"[-70.51, -77.78, -70.43, -119.21]",1.480000
2,KOMONDOR SIMULATION 'sim_input_nodes_s0000_c-...,33.380001,-76.989998,"[-70.51, -77.78, -70.43, -119.21]",1.480000
3,KOMONDOR SIMULATION 'sim_input_nodes_s0000_c-...,33.380001,-75.989998,"[-70.51, -77.78, -70.43, -119.21]",1.480000
4,KOMONDOR SIMULATION 'sim_input_nodes_s0000_c-...,33.389999,-74.989998,"[-70.51, -77.78, -70.43, -119.21]",1.480000
...,...,...,...,...,...
20995,KOMONDOR SIMULATION 'sim_input_nodes_s0999_c-...,50.349998,-41.459999,"[-107.02, -84.46]",110.349998
20996,KOMONDOR SIMULATION 'sim_input_nodes_s0999_c-...,50.349998,-41.459999,"[-107.02, -84.46]",110.349998
20997,KOMONDOR SIMULATION 'sim_input_nodes_s0999_c-...,50.349998,-41.459999,"[-107.02, -84.46]",110.349998
20998,KOMONDOR SIMULATION 'sim_input_nodes_s0999_c-...,50.349998,-41.459999,"[-107.02, -84.46]",110.349998


In [4]:
output_file_root_name = 'output_11ax_sr_simulations'
column_names = ["input_file_name"]


# First we put the data from first scenario into pandas dataframe
data_scene1_raw = pd.read_csv(data_path + output_file_root_name + '_sce1.txt', sep="\n", names=["raw_data"])
data_scene1 = pd.DataFrame()
for x in range(len(column_names)):
    data_scene1[column_names[x]] = data_scene1_raw[(data_scene1_raw.index + x) % 5== 0].reset_index(drop=True)

# Then we put the data from second scerion into pandas dataframe
data_scene2_raw = pd.read_csv(data_path + output_file_root_name + '_sce2.txt', sep="\n", names=["raw_data"], encoding = "latin-1")
data_scene2 = pd.DataFrame()
for x in range(len(column_names)):
    data_scene2[column_names[x]] = data_scene2_raw[(data_scene2_raw.index + x) % 5== 0].reset_index(drop=True)
    

# Then we put the data from second scerion into pandas dataframe
data_scene3_raw = pd.read_csv(data_path + output_file_root_name + '_sce3.txt', sep="\n", names=["raw_data"], encoding = "latin-1")
data_scene3 = pd.DataFrame()
for x in range(len(column_names)):
    data_scene3[column_names[x]] = data_scene3_raw[(data_scene3_raw.index + x) % 5== 0].reset_index(drop=True)
    
    
# Remove the KOMONDOR SIMULATION string from colums!
data_scene1["input_file_name"] = data_scene1["input_file_name"].str.replace("KOMONDOR SIMULATION 'sim_", "")
data_scene1[['input_file_name', 'seed']] = data_scene1['input_file_name'].str.split('(', 1, expand=True)
data_scene1['input_file_name'] = data_scene1['input_file_name'].str.replace("'", "")
data_scene1["seed"] = data_scene1["seed"].str.replace(')', '', regex=True)

data_scene2["input_file_name"] = data_scene2["input_file_name"].str.replace("KOMONDOR SIMULATION 'sim_", "")
data_scene2[['input_file_name', 'seed']] = data_scene2['input_file_name'].str.split('(', 1, expand=True)
data_scene2['input_file_name'] = data_scene2['input_file_name'].str.replace("'", "")
data_scene2["seed"] = data_scene2["seed"].str.replace(')', '', regex=True)


data_scene3["input_file_name"] = data_scene3["input_file_name"].str.replace("KOMONDOR SIMULATION 'sim_", "")
data_scene3[['input_file_name', 'seed']] = data_scene3['input_file_name'].str.split('(', 1, expand=True)
data_scene3['input_file_name'] = data_scene3['input_file_name'].str.replace("'", "")
data_scene3["seed"] = data_scene3["seed"].str.replace(')', '', regex=True)

#output_data_scene2.iloc[19446]

In [5]:
# Split the data based on the context and OBSS/PD information to the dataset
data_scene1["context"] = data_scene1["input_file_name"].str.replace("input_nodes_s", "")
data_scene1["context"] = data_scene1["context"].str[:5].astype(int)
data_scene1["OBSS_PD"] = data_scene1["input_file_name"].str[-7:-5].astype(int)
data_scene1


# Split the data based on the context
data_scene2["context"] = data_scene2["input_file_name"].str[-14:-10].astype(int)  - 1000 # to get context between 0 and 2999
data_scene2["OBSS_PD"] = data_scene2["input_file_name"].str[-7:-5].astype(int) 
#data_scene2

# Split the data based on the context
data_scene3["context"] = data_scene3["input_file_name"].str.replace("input_nodes_s", "")
data_scene3["context"] = data_scene3["context"].str.split(pat="_") 
list_of_context = [] # there was some bug in reading the files...
for row in  data_scene3["context"]:
    list_of_context.append(int(row[0]) + 1000) # to get context between 0 and 2999
data_scene3["context"] = list_of_context
data_scene3["OBSS_PD"] = data_scene3["input_file_name"].str[-7:-5].astype(int)
#data_scene3

In [6]:
print(data_scene1["context"].min(),
data_scene1["context"].max(),
data_scene2["context"].min(),
data_scene2["context"].max(),
data_scene3["context"].min(),
data_scene3["context"].max(),)

0 999 2000 2999 1000 1999


In [7]:
# Helper Functions for reading the dataset
def read_output_simulator(fp, dataset_lenght):       
    RSSI_list = [] 
    SINR_list = [] 
    interference_list = [] 
    throughput_list = []
    # To manually fix bugs in the dataset
    RSSI_bugs = [np.array([-80.77,-81.19,-65.59]), np.array([-61.77,-62.19,-46.59]), np.array([-61.77,-62.19,-46.59]),
                np.array([-61.77,-62.19,-46.59]), np.array([-61.77,-62.19,-46.59]), np.array([-61.77,-62.19,-46.59]), 
                np.array([-61.77,-62.19,-46.59]), np.array([-61.77,-62.19,-46.59])] 
    RSSI_bug_index = 0
    
     # To manually fix bugs in the dataset
    SINR_bugs = [np.array([35.05, 33.56]), np.array([35.16, 30.83]), np.array([33.95, 34.76, 39.32]), np.array([32.91,31.65,47.57]),
                np.array([32.89,31.68,47.43]), np.array([32.92,31.72,47.38]), np.array([32.69,31.58,47.12]) , np.array([32.73,31.56,47.44]), 
                np.array([32.48,31.67,47.61]), np.array([32.74,31.67,47.27]), np.array([32.74,31.61,47.59]), np.array([32.91,31.67,47.61])]
    SINR_bug_index = 0
    
    fp = fp.readlines()
    line_index = 0 
    for index in range(dataset_lenght):
        try:
            line = fp[line_index]                  # Initial line (name of the scenaio)
            line_index += 1
        except:
            print("The problematic index is", line_index)
            line_index += 1
        #print("reading...",line)
        # Throughput 
        throughput = fp[line_index]                 # Throughput
        line_index += 1
        throughput = throughput.strip()         # Remove \n ch
        if "," in throughput:
            throughput = np.array(throughput[0:len(throughput)].split(',')).astype(np.float)
            throughput_list.append(throughput)
        else:
            try:
                throughput = float(throughput)
            except:
                print("There is an throughput data bug at line ", line_index, "raw data is",  throughput)
                print(throughput)
                throughput = 0.0
            throughput_list.append(throughput)
        
        # Interferences
        interference = fp[line_index]             # Interferences
        interference = interference.strip()     # Remove \n ch
        line_index += 1
        if "," in interference:
            interference = np.array(interference[0:len(interference)].split(',')).astype(np.float)
            interference_list.append(interference)
        else:
            interference = float(interference)
            interference_list.append(interference)
        
        # RSSI
        RSSI = fp[line_index]           # RSSI
        RSSI = RSSI.strip()         # Remove \n ch
        line_index += 1
        if "," in RSSI:
            try:
                RSSI = np.array(RSSI[0:len(RSSI)].split(',')).astype(np.float)
            except:
                print("There is an RSSI data bug at line ",line_index, "raw data is",  RSSI)
                RSSI = RSSI_bugs[RSSI_bug_index]
                RSSI_bug_index += 1
            RSSI_list.append(RSSI)
        else:
            RSSI = float(RSSI)
            RSSI_list.append(RSSI)
            
        # SINR
        SINR = fp[line_index]          # SINR
        SINR = SINR.strip()         # Remove \n ch
        line_index += 1
        
        if "," in SINR:
            #SINR.replace('«¤Ð', '')
            #SINR = np.array(SINR[0:len(SINR)].split(',')).astype(np.float)
            #SINR_list.append(SINR)
            try:
                SINR = np.array(SINR[0:len(SINR)].split(',')).astype(np.float)
            except:
                print("There is an SINR data bug at line ",line_index, "raw data is",  SINR)
                SINR = SINR_bugs[SINR_bug_index] # Push in manual corrections
                SINR_bug_index += 1  
            SINR_list.append(SINR)
        else:
            SINR = SINR.replace('f', '')  # As there is a bug in the dataset
            SINR = float(SINR)
            SINR_list.append(SINR)

    return( RSSI_list, SINR_list, interference_list, throughput_list)


def read_input_files(input_dataset_path, input_dataset_names_list):
    """
    A functions that returns input information, currently we only append the data we think is required...
    
    
    All options are below:
    
    ['node_code', 'node_type', 'wlan_code', 'x(m)', 'y(m)', 'z(m)', 'central_freq(GHz)', 
    'channel_bonding_model', 'primary_channel', 'min_channel_allowed', 'max_channel_allowed', 
    'tpc_default(dBm)', 'cca_default(dBm)', 'traffic_model', 'traffic_load[pkt/s]',
    'packet_length', 'num_packets_aggregated', 'capture_effect_model',
    'capture_effect_thr', 'constant_per', 'pifs_activated', 'cw_adaptation',
    'cont_wind', 'cont_wind_stage', 'bss_color', 'spatial_reuse_group',
    'non_srg_obss_pd', 'srg_obss_pd']
    
    """

    list_node_type = []
    x_pos_list = []
    y_pos_list = []
    
    for dataset_name in input_dataset_names_list:
        df  = pd.read_csv(input_dataset_path + dataset_name[1:-1], sep = ';', usecols=['node_type', 'x(m)', 'y(m)'])
        list_node_type.append(df['node_type'].tolist())
        x_pos_list.append(df['x(m)'].tolist())
        y_pos_list.append(df['y(m)'].tolist())
        # Add what you need in this line...
        

    return list_node_type, x_pos_list, y_pos_list

In [8]:
# ADD RSSI, Interference, SNR, and Throughput to the dataframe
output_train_sim_sce1 = output_train_sim+'output_11ax_sr_simulations_sce1.txt'
output_train_sim_sce2 = output_train_sim+'output_11ax_sr_simulations_sce2.txt'
output_train_sim_sce3 = output_train_sim+'output_11ax_sr_simulations_sce3.txt'


fp1 = open(output_train_sim_sce1, 'r')
RSSI, SINR, interference, throughput = read_output_simulator(fp1, len(data_scene1))
data_scene1["RSSI"] = RSSI
data_scene1["SINR"] = SINR
data_scene1["interference"] = interference
data_scene1["throughput"] = throughput


fp2 = open(output_train_sim_sce2, 'r', encoding = "latin-1")
RSSI, SINR, interference, throughput = read_output_simulator(fp2, len(data_scene2))
data_scene2["RSSI"] = RSSI
data_scene2["SINR"] = SINR
data_scene2["interference"] = interference
data_scene2["throughput"] = throughput


fp3 = open(output_train_sim_sce3, 'r', encoding = "latin-1")
RSSI, SINR, interference, throughput = read_output_simulator(fp3, len(data_scene3))
data_scene3["RSSI"] = RSSI
data_scene3["SINR"] = SINR
data_scene3["interference"] = interference
data_scene3["throughput"] = throughput

In [9]:
# Now add the input information to the dataframe as will be used, note this takes a while...try to make it more efficient
input_dataset_path1 = data_path+'simulator_input_files_sce1/'
input_dataset_path2 = data_path+'simulator_input_files_sce2/'
input_dataset_path3 = data_path+'simulator_input_files_sce3/'

node_types, x_positions, y_postions = read_input_files(input_dataset_path1, data_scene1['input_file_name'].tolist())
data_scene1["node_type"] = node_types
data_scene1["x(m)"] = x_positions
data_scene1["y(m)"] = y_postions

node_types, x_positions, y_postions = read_input_files(input_dataset_path2, data_scene2['input_file_name'].tolist())
data_scene2["node_type"] = node_types
data_scene2["x(m)"] = x_positions
data_scene2["y(m)"] = y_postions


node_types, x_positions, y_postions = read_input_files(input_dataset_path3, data_scene3['input_file_name'].tolist())
data_scene3["node_type"] = node_types
data_scene3["x(m)"] = x_positions
data_scene3["y(m)"] = y_postions

In [10]:
# Combine the dataset and save it to a file using pickle (to preserve the original datatypes) extension
entire_dataset = pd.concat([data_scene1, data_scene2, data_scene3], ignore_index=True)
entire_dataset
#save to csv file
#entire_dataset.to_csv('cleaned_dataset/train.csv', index=False)

#pq.write_table(pa.Table.from_pandas(entire_dataset), 'cleaned_dataset/train.parquet')
entire_dataset.to_pickle('cleaned_dataset/train.pkl')

In [11]:
# An example on how to read the .pikle file
#dataset_train = pd.read_csv('cleaned_dataset/train.csv')
dataset_train = pd.read_pickle('cleaned_dataset/train.pkl')

In [18]:
len(entire_dataset.loc[entire_dataset['context'] == 1191])

0

In [23]:
list_of_context_that_do_not_exist = []
for i in range(3000):
    if len(entire_dataset.loc[entire_dataset['context'] == i])  < 1:
        #print(i)
        list_of_context_that_do_not_exist.append(i)
print(list_of_context_that_do_not_exist, len(list_of_context_that_do_not_exist))

[1004, 1025, 1116, 1124, 1142, 1163, 1185, 1191, 1205, 1208, 1209, 1224, 1236, 1242, 1260, 1282, 1295, 1302, 1312, 1318, 1324, 1341, 1348, 1373, 1405, 1407, 1422, 1471, 1485, 1503, 1514, 1520, 1590, 1601, 1640, 1670, 1681, 1693, 1730, 1742, 1766, 1771, 1792, 1810, 1811, 1818, 1842, 1856, 1880, 1903, 1931, 1943, 1966, 1990] 54
