# Data Cleaning and Pre-Processing

## Libraries and Configurations

Import configuration files

In [None]:
from configparser import ConfigParser
import os

# Load the notebook settings from "config.ini", looked up relative to the
# current working directory. Later cells read their values from `config`.
config = ConfigParser()
config.read(["config.ini"])

Import **data libraries**

In [None]:
import pandas as pd
import numpy
import matplotlib.pyplot as plt
import seaborn as sns

Import **other libraries**

In [None]:
from rich.progress import Progress, BarColumn, TextColumn
from rich import traceback

# Register rich's traceback handler for the rest of the session.
traceback.install()

Custom helper scripts

In [None]:
# Step up one directory before importing, presumably because the `scripts`
# package sits next to this notebook's folder rather than inside it.
%cd ..
from scripts import plotHelper
# Return to the notebook's own folder so the relative paths used in the
# following cells resolve from `data_exploration_cleaning`.
%cd data_exploration_cleaning

## Import Data

In [None]:
# Base directory containing the folders
base_dir = config["DEFAULT"]["extracted_path"]

# Maps one key per CSV file to the DataFrame loaded from it.
dataframes = {}

# Walk the whole tree under base_dir and load every CSV file found.
for root, dirs, files in os.walk(base_dir):
    # Sorting `dirs` in place makes os.walk descend in a stable order, so the
    # load order (and the row order of a later concatenation) does not depend
    # on the order in which the filesystem happens to list entries.
    dirs.sort()

    for file in sorted(files):
        if not file.endswith(".csv"):
            continue

        # Construct the full file path
        file_path = os.path.join(root, file)

        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)

        # The bare file name stays the key while it is unique. When the same
        # name appears again in another folder, fall back to the path relative
        # to base_dir so the earlier DataFrame is not silently overwritten.
        if file in dataframes:
            key = os.path.relpath(file_path, base_dir)
        else:
            key = file
        dataframes[key] = df

Concatenating the devices' dataframes and converting the *Timestamp* column to datetime

In [None]:
# Stack every loaded frame into a single one, discarding the per-file row
# indices in favour of a fresh continuous index.
frames = list(dataframes.values())
combined_df = pd.concat(frames, ignore_index=True)

# "Timestamp" values are parsed as a number of seconds (unit="s").
parsed_timestamps = pd.to_datetime(combined_df["Timestamp"], unit="s")
combined_df["Timestamp"] = parsed_timestamps

Saving the initial, concatenated raw dataframe (before the empty fields are filled)

In [None]:
# Write the concatenated frame to the interim data folder; index=False keeps
# the row index out of the file.
raw_csv_path = "../../data/interim/combined_df_raw.csv"
combined_df.to_csv(raw_csv_path, index=False)

Filling the empty fields with `-1`

In [None]:
# Replace every missing value with the string "-1", modifying combined_df
# in place.
# NOTE(review): the fill value is a string, so numeric columns that had gaps
# presumably end up holding mixed types; confirm this is intended.
combined_df.fillna(value="-1", inplace=True)

In [None]:
# Bare expression on the last line of the cell, so the notebook renders the
# frame as the cell output.
combined_df

In [None]:
# Plot the "Label" column through the project helper, with log_scale enabled.
plotHelper.plot_label_distribution(combined_df, "Label", log_scale=True)

In [None]:
# Same helper, applied to the "HT Capabilities" column with log_scale enabled.
plotHelper.plot_label_distribution(combined_df, "HT Capabilities", log_scale=True)

In [None]:
# Same helper, applied to the "SSID" column with log_scale enabled.
plotHelper.plot_label_distribution(combined_df, "SSID", log_scale=True)

In [None]:
# Heatmap helper called with the "Label" and "DS Channel" columns; how the two
# are aggregated is defined in scripts/plotHelper, which is not shown here.
plotHelper.plot_heatmap(combined_df, "Label", "DS Channel")

In [None]:
# Heatmap helper called with the "Channel" and "DS Channel" columns.
plotHelper.plot_heatmap(combined_df, "Channel", "DS Channel")

In [None]:
# Heatmap helper called with the "Label" and "Extended Capabilities" columns.
plotHelper.plot_heatmap(combined_df, "Label", "Extended Capabilities")