# Data Preparation Plots

This notebook contains the plots used in the thesis to compare the effects of the individual data preparation steps and of the augmentation.

In [84]:
import pandas as pd
import utilities.plotting as plots
import utilities.preprocess_raw_data as prep_data

In [85]:
# Load the merged raw dataset from its parquet file.
raw_merged_path = "./data/my_datasets/raw_merged.parquet"
df_raw = pd.read_parquet(raw_merged_path)

In [86]:
# Restrict the rest of the notebook to one device: the first UUID in order of
# appearance. Note that df_raw is narrowed to that device from here on.
device_ids = df_raw["device_uuid"].unique()
DEV = device_ids[0]
belongs_to_dev = df_raw["device_uuid"] == DEV
df_raw = df_raw[belongs_to_dev]

In [87]:
# Figure: battery level of the selected device, taken straight from the raw data.
raw_plot_title = "Verlauf der Batterieladung (Rohdaten)"
plots.plot_device(
    df_raw,
    DEV,
    color="#F00",
    title=raw_plot_title,
)

In [88]:
# Step 1: daily mean of the raw data, followed by the same device plot.
df_daily_mean = prep_data.calculate_daily_mean(df_raw)

daily_mean_title = "Verlauf der Batterieladung (täglicher Durchschnitt)"
plots.plot_device(
    df_daily_mean,
    DEV,
    color="#F00",
    title=daily_mean_title,
)

In [89]:
# Step 2: median smoothing of the daily means with a window of 5.
df_median_smoothed = prep_data.smooth_df_using_median(
    df_daily_mean,
    window=5,
)

median_title = "Verlauf der Batterieladung (nach Medianfilter)"
plots.plot_device(
    df_median_smoothed,
    DEV,
    color="#F00",
    title=median_title,
)

In [90]:
# Step 3: label the peaks in the median-smoothed series (the plot title refers
# to them as detected battery replacement times).
df_labeled = prep_data.label_df_peaks(df_median_smoothed)

peaks_title = "Verlauf der Batterieladung mit eingezeichneten erkannten Austauschzeitpunkten"
plots.plot_devices_and_peaks(df_labeled, color="#F00", title=peaks_title)

# Second figure: the battery_diff column of the labelled frame, drawn without
# the fixed y-axis used by the other device plots.
slope_plot_options = {
    "col_to_plot": "battery_diff",
    "color": "#00F",
    "title": "Steigung der Batterieentladungskurve",
    "y_desc": "Änderung der Batterieladung in %/Tag",
    "fixed_y_axis": False,
}
plots.plot_device(df_labeled, DEV, **slope_plot_options)

In [91]:
# Step 4: split the labelled series into cycles and plot them.
df_cycles = prep_data.extract_cycles(df_labeled)

cycles_title = "Verlauf der Batterieladung aufgetrennt nach Entladungszyklen"
plots.plot_cycles(df_cycles, title=cycles_title)

In [92]:
# Step 5: filter outliers from the cycles, then forward-fill them.
df_filtered = prep_data.filter_outliers(df_cycles)
df_ffilled = prep_data.ffill_cycles(df_filtered)

# The title names the processing stage, like the earlier "(Rohdaten)" /
# "(nach Medianfilter)" figures, so this plot can be told apart from the
# unfiltered cycle plot.
plots.plot_cycles(
    df_ffilled,
    title="Verlauf der Batterieladung aufgetrennt nach Entladungszyklen (nach Ausreißerfilterung und Forward-Fill)",
)

In [93]:
# Step 6: drop cycles (grouped by cycle_id) that do not meet the count and
# range thresholds passed below.
df_sufficient = prep_data.drop_insufficient_data(
    df_ffilled,
    column="cycle_id",
    count_thresh=25,
    range_thresh=20,
)

# The title names the processing stage so this figure is distinguishable from
# the two previous cycle plots.
plots.plot_cycles(
    df_sufficient,
    title="Verlauf der Batterieladung aufgetrennt nach Entladungszyklen (nur Zyklen mit ausreichender Datenbasis)",
)

In [119]:
# Column overview (non-null counts and dtypes) of the frame that remains after
# dropping insufficient cycles.
df_sufficient.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453 entries, 0 to 452
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   status_time            453 non-null    datetime64[ns]
 1   device_uuid            453 non-null    object        
 2   battery_level_percent  453 non-null    float64       
 3   radio_level_percent    453 non-null    float64       
 4   air_temperature        453 non-null    float64       
 5   battery_type_id        453 non-null    float64       
 6   fw_version             453 non-null    object        
 7   device_model_code      453 non-null    object        
 8   battery_diff           453 non-null    float64       
 9   cycle_id               453 non-null    float64       
dtypes: datetime64[ns](1), float64(6), object(3)
memory usage: 35.5+ KB


## Augmentierung

In [94]:
# Build the base dataset with noise enabled for the battery level (max_noise=10)
# and for the temperature (max_noise_temperature=5), then plot all devices in it.
# NOTE(review): no random seed is set anywhere in this notebook, so this figure
# presumably changes between runs — confirm how load_base_dataset draws its noise.
noise_options = {
    "add_noise": True,
    "max_noise": 10,
    "add_noise_temperature": True,
    "max_noise_temperature": 5,
}
df_base_10noise = prep_data.load_base_dataset(raw_merged_df=df_raw, **noise_options)
plots.plot_devices(df_base_10noise, color="#F00")

In [115]:
# Build the base dataset with random warping of the status times enabled.
# NOTE(review): the variable name says "5" while the limit passed below is 20 —
# confirm which value is intended.
time_warp_options = {
    "random_warp_status_times": True,
    "random_max_time_warp_percent": 20,
}
df_base_5_time_warping = prep_data.load_base_dataset(raw_merged_df=df_raw, **time_warp_options)
plots.plot_devices(df_base_5_time_warping, color="#F00")

In [116]:
# Overlay the median-smoothed battery level and the noise-augmented battery
# level of the selected device.
battery_jitter_series = [
    (df_median_smoothed, "battery_level_percent", DEV, "Unaugmentiert", True),
    (df_base_10noise, "battery_level_percent", DEV, "10% Jittering auf<br>Batterielevel", True),
]
battery_jitter_title = "Vergleich von unaugmentiertem Batterieverlauf zu Batterieverläufen mit Jittering"
plots.plot_compare_multiple_dfs(
    battery_jitter_series,
    colors=["#F00", "#555", "#555"],
    title=battery_jitter_title,
)

In [117]:
# Overlay the median-smoothed battery level and the time-warped battery level
# of the selected device.
# The legend label must state the warp limit the augmented frame was built with
# (random_max_time_warp_percent=20), in the same way the other comparison
# labels mirror max_noise=10 and max_noise_temperature=5.
# The title names the augmented quantity so this figure is distinguishable from
# the battery-level jittering comparison.
plots.plot_compare_multiple_dfs(
    [
        (df_median_smoothed, "battery_level_percent", DEV, "Unaugmentiert", True),
        (df_base_5_time_warping, "battery_level_percent", DEV, "20% Jittering<br>auf Messabstände", True),
    ],
    colors=["#F00", "#555", "#555"],
    title="Vergleich von unaugmentiertem Batterieverlauf zu Batterieverläufen mit Jittering auf Messabstände",
)

In [118]:
# Overlay the median-smoothed air temperature and the noise-augmented air
# temperature of the selected device, with a free y-axis.
temperature_series = [
    (df_median_smoothed, "air_temperature", DEV, "Unaugmentiert", True),
    (df_base_10noise, "air_temperature", DEV, "5% Jittering auf<br>Lufttemperatur", True),
]
temperature_plot_options = {
    "colors": ["#F00", "#555"],
    "fixed_y_axis": False,
    "title": "Vergleich von unaugmentierter Lufttemperatur zu Lufttemperatur mit 5% Jittering",
    "y_desc": "Lufttemperatur in °C",
}
plots.plot_compare_multiple_dfs(temperature_series, **temperature_plot_options)

In [105]:
# Load the survival test split of experiment 9.
survival_test_path = "./data/experiment_datasets/experiment_9/survival/test.parquet"
df_surv = pd.read_parquet(survival_test_path)

In [106]:
# Column overview (non-null counts and dtypes) of the survival test split.
df_surv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38 entries, 0 to 44
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   device_uuid                  38 non-null     object 
 1   cycle_id                     38 non-null     float64
 2   duration                     38 non-null     int64  
 3   batt_min                     38 non-null     float64
 4   batt_max                     38 non-null     float64
 5   batt_diff                    38 non-null     float64
 6   batt_median                  38 non-null     float64
 7   daily_roc                    38 non-null     float64
 8   temp_max                     38 non-null     float64
 9   temp_diff                    38 non-null     float64
 10  temp_median                  38 non-null     float64
 11  radio_diff                   38 non-null     float64
 12  radio_median                 38 non-null     float64
 13  event                      

In [107]:
# Sum the three fw_version_* columns row-wise and show how often each sum occurs.
fw_version_columns = ["fw_version_v01.70", "fw_version_v01.66", "fw_version_v01.49"]
fw_version_flags = df_surv[fw_version_columns]
df_surv["n_fw_versions"] = fw_version_flags.sum(axis=1)
df_surv["n_fw_versions"].value_counts()

n_fw_versions
1    28
2    10
Name: count, dtype: int64

In [108]:
# df_surv["n_fw_versions"] is already computed and stored by the preceding cell.
# Recomputing and reassigning it here was an exact copy of that cell, so only
# the distribution is displayed.
df_surv["n_fw_versions"].value_counts()

n_fw_versions
1    28
2    10
Name: count, dtype: int64