# Data Cleaning and Pre-Processing

## Libraries and Configurations

Import configuration files

In [1]:
from configparser import ConfigParser
import os

# Load the project settings that the later cells read their paths from.
config = ConfigParser()
loaded_files = config.read("../../config.ini")

# ConfigParser.read() silently skips files it cannot open and just returns
# the list of files it did parse. Without this check a wrong working
# directory would only surface later as a KeyError on the first lookup.
if not loaded_files:
    raise FileNotFoundError(
        f"Could not read '../../config.ini' (working directory: {os.getcwd()})"
    )

# Echo the parsed file list as the cell output.
loaded_files

['../../config.ini']

Import **data libraries**

In [2]:
import pandas as pd

Import **other libraries**

In [3]:
from rich import traceback

# Route uncaught exceptions through rich's formatted traceback renderer.
# install() returns the previously active exception hook, which is the
# value the notebook echoes as this cell's output.
traceback.install()

<bound method InteractiveShell.excepthook of <ipykernel.zmqshell.ZMQInteractiveShell object at 0x78132de93e10>>

## Import Data

In [4]:
# Base directory containing the folders
base_dir = config["DEFAULT"]["dissected_path"]

# Maps each CSV's path (relative to base_dir) to the DataFrame read from it
dataframes = {}

# Traverse the directory structure
for root, dirs, files in os.walk(base_dir):
    # Sorting `dirs` in place makes os.walk descend in a stable order, so the
    # order of the loaded frames does not depend on the filesystem.
    dirs.sort()

    for file in sorted(files):
        if file.endswith(".csv"):
            # Construct the full file path
            file_path = os.path.join(root, file)

            # Read the CSV file into a DataFrame
            df = pd.read_csv(file_path)

            # Key by path relative to base_dir rather than by bare file name:
            # two sub-folders may hold files with the same name, and a
            # name-only key would let the later one silently replace the
            # earlier one. Files directly under base_dir keep their plain name.
            dataframes[os.path.relpath(file_path, base_dir)] = df

# os.walk yields nothing for a missing or empty directory; stop here with a
# clear message instead of failing later when the frames are concatenated.
if not dataframes:
    raise FileNotFoundError(f"No CSV files found under {base_dir!r}")

## Combine and Clean Data

Concatenating devices' dataframes and converting *Timestamp* column

In [5]:
# Stack every loaded frame into a single table with a fresh 0..n-1 index,
# then replace the Timestamp column with datetimes parsed from values
# expressed in seconds.
dissected_df = pd.concat(list(dataframes.values()), ignore_index=True).assign(
    Timestamp=lambda frame: pd.to_datetime(frame["Timestamp"], unit="s")
)

Saving the initial concatenated dataframe, before the empty fields are filled

In [6]:
# Build the output location with os.path.join so the path is correct whether
# or not interim_path ends with a path separator.
raw_out_dir = os.path.join(config["DEFAULT"]["interim_path"], "dissected")

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing.
os.makedirs(raw_out_dir, exist_ok=True)

# index=False keeps the row index out of the file.
dissected_df.to_csv(os.path.join(raw_out_dir, "dissected_df_raw.csv"), index=False)

Filling the empty fields with `-1`

In [7]:
# Replace every missing value with the sentinel string "-1". Numeric columns
# that had gaps will afterwards hold a mix of numbers and that string.
# The result is re-bound rather than filled with inplace=True, so the cell
# does not mutate an existing object and gives the same result when re-run.
# NOTE(review): the in-place call left a pandas warning in the saved output;
# confirm it no longer appears with this form.
dissected_df = dissected_df.fillna("-1")

  dissected_df.fillna("-1", inplace=True)


In [8]:
# Show the frame after filling; the notebook renders a truncated view
# (first/last rows and columns) for a table of this size.
dissected_df

Unnamed: 0,Timestamp,MAC Address,Channel,DS Channel,Vendor Specific Tags,SSID,VHT Capabilities,HE Capabilities,Length,Label,...,Channel_Schedule_Management,Geodatabase_Inband_Enabling_Signal,Network_Channel_Control,White_Space_Map,Channel_Availability_Query,FTM_Responder,FTM_Initiator,Reserved_6,ESM_Capability,Future_Channel_Guidance
0,2023-05-27 11:38:16.184592962,1a:e6:5a:fe:34:4c,1,1.0,-1,-1,-1,-1,111,iPhone7_F,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,2023-05-27 11:38:16.205511093,1a:e6:5a:fe:34:4c,1,1.0,-1,-1,-1,-1,111,iPhone7_F,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,2023-05-27 11:38:16.205514908,1a:e6:5a:fe:34:4c,11,1.0,-1,-1,-1,-1,111,iPhone7_F,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,2023-05-27 11:38:16.232511997,1a:e6:5a:fe:34:4c,1,2.0,-1,-1,-1,-1,111,iPhone7_F,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,2023-05-27 11:38:16.232516050,1a:e6:5a:fe:34:4c,11,2.0,-1,-1,-1,-1,111,iPhone7_F,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76699,2021-06-03 13:54:07.334428072,da:a1:19:45:40:f0,11,13.0,0050f208006200,-1,-1,-1,119,XiaomiRedmi4_B,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
76700,2021-06-03 13:54:07.340886116,da:a1:19:45:40:f0,11,13.0,0050f208006200,1117sx,-1,-1,125,XiaomiRedmi4_B,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
76701,2021-06-03 13:54:07.341959000,da:a1:19:45:40:f0,11,13.0,0050f208006200,!op0ssum@,-1,-1,128,XiaomiRedmi4_B,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
76702,2021-06-03 13:54:07.343002081,da:a1:19:45:40:f0,11,13.0,0050f208006200,Vodafone,-1,-1,127,XiaomiRedmi4_B,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


Saving `dissected_df` with its empty fields filled with `-1`.

In [9]:
# Build the output location with os.path.join so the path is correct whether
# or not interim_path ends with a path separator.
filled_out_dir = os.path.join(config["DEFAULT"]["interim_path"], "dissected")

# to_csv does not create missing parent directories, so make sure the target
# folder exists before writing.
os.makedirs(filled_out_dir, exist_ok=True)

# index=False keeps the row index out of the file.
dissected_df.to_csv(os.path.join(filled_out_dir, "dissected_df.csv"), index=False)

In [10]:
# Display the frame once more after the export; nothing has reassigned or
# altered it since the previous display, so the view is the same.
dissected_df

Unnamed: 0,Timestamp,MAC Address,Channel,DS Channel,Vendor Specific Tags,SSID,VHT Capabilities,HE Capabilities,Length,Label,...,Channel_Schedule_Management,Geodatabase_Inband_Enabling_Signal,Network_Channel_Control,White_Space_Map,Channel_Availability_Query,FTM_Responder,FTM_Initiator,Reserved_6,ESM_Capability,Future_Channel_Guidance
0,2023-05-27 11:38:16.184592962,1a:e6:5a:fe:34:4c,1,1.0,-1,-1,-1,-1,111,iPhone7_F,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,2023-05-27 11:38:16.205511093,1a:e6:5a:fe:34:4c,1,1.0,-1,-1,-1,-1,111,iPhone7_F,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,2023-05-27 11:38:16.205514908,1a:e6:5a:fe:34:4c,11,1.0,-1,-1,-1,-1,111,iPhone7_F,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,2023-05-27 11:38:16.232511997,1a:e6:5a:fe:34:4c,1,2.0,-1,-1,-1,-1,111,iPhone7_F,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,2023-05-27 11:38:16.232516050,1a:e6:5a:fe:34:4c,11,2.0,-1,-1,-1,-1,111,iPhone7_F,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76699,2021-06-03 13:54:07.334428072,da:a1:19:45:40:f0,11,13.0,0050f208006200,-1,-1,-1,119,XiaomiRedmi4_B,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
76700,2021-06-03 13:54:07.340886116,da:a1:19:45:40:f0,11,13.0,0050f208006200,1117sx,-1,-1,125,XiaomiRedmi4_B,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
76701,2021-06-03 13:54:07.341959000,da:a1:19:45:40:f0,11,13.0,0050f208006200,!op0ssum@,-1,-1,128,XiaomiRedmi4_B,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
76702,2021-06-03 13:54:07.343002081,da:a1:19:45:40:f0,11,13.0,0050f208006200,Vodafone,-1,-1,127,XiaomiRedmi4_B,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
