# Data Preparation Plots

This notebook contains the plots used in the thesis to compare the effects of the individual data preparation steps and of the data augmentation.

In [2]:
import pandas as pd
import utilities.plotting as plots
import utilities.preprocess_raw_data as prep_data

In [3]:
# Load the merged raw dataset from its parquet file.
df_raw = pd.read_parquet("./data/my_datasets/raw_merged.parquet")

In [4]:
# Use the first device UUID (in order of appearance) as the example device
# and keep only that device's rows in df_raw.
DEV = pd.unique(df_raw["device_uuid"])[0]
df_raw = df_raw.loc[df_raw["device_uuid"].eq(DEV)]

In [5]:
# Plot the battery-level curve of the selected device from the raw data.
plots.plot_device(df_raw, DEV, color="#F00", title="Verlauf der Batterieladung (Rohdaten)")

In [6]:
# Jittered variant of the single-device raw data, used in the comparison plots.
# max_deviation=10 is presumably a percentage (the plot legend reads "10% Jittering") — confirm in prep_data.
df_noise = prep_data.add_noise_to_devices(df_raw, max_deviation=10)

In [7]:
# Overlay the raw and the jittered battery curve of the same device.
# Each tuple is (frame, column, device, legend label, flag); the meaning of the
# trailing flag is defined in utilities.plotting.
# NOTE(review): three colors are passed for two traces; the third entry looks unused — confirm.
plots.plot_compare_multiple_dfs(
    [
        (df_raw, "battery_level_percent", DEV, "Unaugmentiert", True),
        (df_noise, "battery_level_percent", DEV, "10% Jittering auf<br>Batterielevel", True),
    ],
    colors=["#F00", "#555", "#555"],
    title="Vergleich von unaugmentiertem Batterieverlauf zu Batterieverläufen mit Jittering auf Rohdaten"
)

In [8]:
# Run the same two steps (daily mean, then median smoothing) on the raw and on
# the jittered frame so both can be compared after smoothing.
df_raw_smoothed = prep_data.smooth_df_using_median(prep_data.calculate_daily_mean(df_raw))
df_noise_smoothed = prep_data.smooth_df_using_median(prep_data.calculate_daily_mean(df_noise))

In [9]:
# Overlay the smoothed raw curve and the smoothed jittered curve of the same device.
# NOTE(review): three colors are passed for two traces; the third entry looks unused — confirm.
plots.plot_compare_multiple_dfs(
    [
        (df_raw_smoothed, "battery_level_percent", DEV, "Unaugmentiert", True),
        (df_noise_smoothed, "battery_level_percent", DEV, "10% Jittering auf<br>Batterielevel", True),
    ],
    colors=["#F00", "#555", "#555"],
    title="Vergleich von unaugmentiertem Batterieverlauf zu Batterieverläufen mit Jittering nach Smoothing"
)

In [5]:
# Aggregate the single-device raw data to daily means and plot the result.
df_daily_mean = prep_data.calculate_daily_mean(df_raw)
plots.plot_device(df_daily_mean, DEV, color="#F00", title="Verlauf der Batterieladung (täglicher Durchschnitt)")

In [6]:
# Smooth the daily means with the median filter (window=5) and plot the result.
df_median_smoothed = prep_data.smooth_df_using_median(df_daily_mean, window=5)
plots.plot_device(df_median_smoothed, DEV, color="#F00", title="Verlauf der Batterieladung (nach Medianfilter)")

In [7]:
# Label peaks in the smoothed curve.
df_labeled = prep_data.label_df_peaks(df_median_smoothed)
# First figure: battery curve with the detected replacement times marked.
plots.plot_devices_and_peaks(
    df_labeled,
    color="#F00",
    title="Verlauf der Batterieladung mit eingezeichneten erkannten Austauschzeitpunkten",
)
# Second figure: the "battery_diff" column (y-axis label: change of battery
# level in %/day), drawn without the fixed y-axis used for battery levels.
plots.plot_device(
    df_labeled,
    DEV,
    col_to_plot="battery_diff",
    color="#00F",
    title="Steigung der Batterieentladungskurve",
    y_desc="Änderung der Batterieladung in %/Tag",
    fixed_y_axis=False,
)

In [8]:
# Split the labelled data into discharge cycles and plot them.
df_cycles = prep_data.extract_cycles(df_labeled)
plots.plot_cycles(df_cycles, title="Verlauf der Batterieladung aufgetrennt nach Entladungszyklen")

In [9]:
# Apply the outlier filter, then ffill_cycles, and plot the cycles again.
# NOTE(review): the title is identical to the previous plot — the figures are only told apart by cell order.
df_filtered = prep_data.filter_outliers(df_cycles)
df_ffilled = prep_data.ffill_cycles(df_filtered)
plots.plot_cycles(df_ffilled, title="Verlauf der Batterieladung aufgetrennt nach Entladungszyklen")

In [10]:
# Drop groups (keyed by "cycle_id") that carry insufficient data and plot what remains.
# The exact semantics of count_thresh / range_thresh are defined in prep_data.
df_sufficient = prep_data.drop_insufficient_data(
    df_ffilled,
    column="cycle_id",
    count_thresh=25,
    range_thresh=20,
)
plots.plot_cycles(
    df_sufficient, title="Verlauf der Batterieladung aufgetrennt nach Entladungszyklen"
)

In [11]:
# Summarize columns, dtypes and non-null counts of the remaining data.
df_sufficient.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 453 entries, 0 to 452
Data columns (total 10 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   status_time            453 non-null    datetime64[ns]
 1   device_uuid            453 non-null    object        
 2   battery_level_percent  453 non-null    float64       
 3   radio_level_percent    453 non-null    float64       
 4   air_temperature        453 non-null    float64       
 5   battery_type_id        453 non-null    float64       
 6   fw_version             453 non-null    object        
 7   device_model_code      453 non-null    object        
 8   battery_diff           453 non-null    float64       
 9   cycle_id               453 non-null    float64       
dtypes: datetime64[ns](1), float64(6), object(3)
memory usage: 35.5+ KB


## Augmentierung

In [12]:
# Build the base dataset from the single-device raw data with noise enabled for
# the battery level (max_noise=10) and the temperature (max_noise_temperature=5),
# then plot it.
df_base_10noise = prep_data.load_base_dataset(
    raw_merged_df=df_raw, add_noise=True, max_noise=10, add_noise_temperature=True, max_noise_temperature=5
)
plots.plot_devices(df_base_10noise, color="#F00")

In [13]:
# Build the base dataset with randomly warped status times, then plot it.
# NOTE(review): the variable name says "5", the argument is 20, and the legend in
# the later comparison plot says "500%" — confirm which value is intended.
df_base_5_time_warping = prep_data.load_base_dataset(
    raw_merged_df=df_raw, random_warp_status_times=True, random_max_time_warp_percent=20
)
plots.plot_devices(df_base_5_time_warping, color="#F00")

In [14]:
# Overlay the median-smoothed (un-augmented) battery curve and the base dataset
# built with battery-level noise, for the same device.
# NOTE(review): three colors are passed for two traces; the third entry looks unused — confirm.
plots.plot_compare_multiple_dfs(
    [
        (df_median_smoothed, "battery_level_percent", DEV, "Unaugmentiert", True),
        (df_base_10noise, "battery_level_percent", DEV, "10% Jittering auf<br>Batterielevel", True),
    ],
    colors=["#F00", "#555", "#555"],
    title="Vergleich von unaugmentiertem Batterieverlauf zu Batterieverläufen mit Jittering"
)

In [15]:
# Overlay the median-smoothed (un-augmented) battery curve and the time-warped
# base dataset, for the same device.
# NOTE(review): the legend says "500%" while the dataset was built with
# random_max_time_warp_percent=20 — confirm the label.
# NOTE(review): three colors are passed for two traces; the third entry looks unused — confirm.
plots.plot_compare_multiple_dfs(
    [
        (df_median_smoothed, "battery_level_percent", DEV, "Unaugmentiert", True),
        (df_base_5_time_warping, "battery_level_percent", DEV, "500% Jittering<br>auf Messabstände", True),
    ],
    colors=["#F00", "#555", "#555"],
    title="Vergleich von unaugmentiertem Batterieverlauf zu Batterieverläufen mit Jittering"
)

In [16]:
# Overlay the un-augmented and the jittered air temperature of the same device.
# fixed_y_axis=False and a custom y-axis label are passed because the plotted
# column is a temperature in °C rather than a battery level.
plots.plot_compare_multiple_dfs(
    [
        (df_median_smoothed, "air_temperature", DEV, "Unaugmentiert", True),
        (df_base_10noise, "air_temperature", DEV, "5% Jittering auf<br>Lufttemperatur", True),
    ],
    colors=["#F00", "#555"],
    fixed_y_axis=False,
    title="Vergleich von unaugmentierter Lufttemperatur zu Lufttemperatur mit 5% Jittering",
    y_desc="Lufttemperatur in °C",
)

In [17]:
# Load the survival test split stored under experiment_9.
df_surv = pd.read_parquet("./data/experiment_datasets/experiment_9/survival/test.parquet")

In [18]:
# Summarize columns, dtypes and non-null counts of the survival test split.
df_surv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38 entries, 0 to 44
Data columns (total 22 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   device_uuid                  38 non-null     object 
 1   cycle_id                     38 non-null     float64
 2   duration                     38 non-null     int64  
 3   batt_min                     38 non-null     float64
 4   batt_max                     38 non-null     float64
 5   batt_diff                    38 non-null     float64
 6   batt_median                  38 non-null     float64
 7   daily_roc                    38 non-null     float64
 8   temp_max                     38 non-null     float64
 9   temp_diff                    38 non-null     float64
 10  temp_median                  38 non-null     float64
 11  radio_diff                   38 non-null     float64
 12  radio_median                 38 non-null     float64
 13  event                      

In [19]:
# Sum the three fw_version_* columns row-wise and show how often each sum occurs.
df_surv["n_fw_versions"] = df_surv.loc[
    :, ["fw_version_v01.70", "fw_version_v01.66", "fw_version_v01.49"]
].sum(axis="columns")
df_surv["n_fw_versions"].value_counts()

n_fw_versions
1    28
2    10
Name: count, dtype: int64

In [20]:
# Sum the three fw_version_* columns row-wise and show how often each sum occurs.
# NOTE(review): this cell repeats the preceding cell verbatim; it recomputes the
# same column and yields the same counts.
df_surv["n_fw_versions"] = df_surv[["fw_version_v01.70", "fw_version_v01.66", "fw_version_v01.49"]].sum(axis=1)
df_surv["n_fw_versions"].value_counts()

n_fw_versions
1    28
2    10
Name: count, dtype: int64

### Iterative prediction on censored cycle

In [21]:
from cross_validate_regression import load_model
from utilities.generate_regression_dataset import base_to_regression_dataset

In [22]:
# Reload the full raw dataset (df_raw was restricted to a single device earlier
# in this notebook) and build the base dataset without any augmentation arguments.
df_raw = pd.read_parquet("./data/my_datasets/raw_merged.parquet")
df_base = prep_data.load_base_dataset(raw_merged_df=df_raw)

In [23]:
# Convert the base dataset into the regression dataset; the call prints how many
# rows and cycles were dropped.
df_reg = base_to_regression_dataset(df_base)

Dropped 4204 rows in 29 cycles.
153 cycles left.


In [24]:
# Keep only the rows that belong to cycle 2.
df_cycle_2 = df_reg.loc[df_reg["cycle_id"].eq(2)]

In [78]:
# Plot the battery level of cycle 2 for the example device.
plots.plot_device(df_cycle_2, DEV, color="#09D09D", title="Batterieladung für Cycle 2.")

In [26]:
# Inspect the available column names before renaming them.
df_cycle_2.columns

Index(['status_time', 'device_uuid', 'battery_level_percent',
       'radio_level_percent', 'air_temperature', 'battery_diff', 'cycle_id',
       'battery_level_percent_rolling_median_5',
       'battery_diff_rolling_median_5',
       'battery_level_percent_rolling_median_50',
       'battery_diff_rolling_median_50', 'fw_version_v01.49',
       'fw_version_v01.66', 'fw_version_v01.70', 'battery_type_id_1.0',
       'battery_type_id_2.0', 'device_model_code_0572 2620',
       'device_model_code_0572 2621', 'device_model_code_0572 2622',
       'device_model_code_0572 2623', 'target'],
      dtype='object')

In [27]:
# Strip the one-hot prefixes ("fw_version_", "battery_type_id_", "device_model_code_")
# from the encoded categorical columns of df_cycle_2.
# NOTE(review): "battery_type_id_1.2" does not occur in df_cycle_2.columns (the
# column listing shows "battery_type_id_2.0"), so that entry has no effect and
# "battery_type_id_2.0" keeps its prefix — confirm the intended key.
rename_mapper = {
    "fw_version_v01.49": "v01.49",
    "fw_version_v01.66": "v01.66",
    "fw_version_v01.70": "v01.70",
    "battery_type_id_1.0": "1.0",
    "battery_type_id_1.2": "1.2",
    "device_model_code_0572 2620": "0572 2620",
    "device_model_code_0572 2621": "0572 2621",
    "device_model_code_0572 2622": "0572 2622",
    "device_model_code_0572 2623": "0572 2623",
}
df_cycle_2 = df_cycle_2.rename(columns=rename_mapper)

In [28]:
# Read the logged regression runs and pick the LinearRegression run with the
# smallest "metrics.mdt" value (ascending sort, first row).
df_reg_runs = pd.read_csv("./data/runs/reg_runs_10.csv")
best_linear = (
    df_reg_runs.loc[df_reg_runs["params.model_class"].eq("LinearRegression")]
    .sort_values("metrics.mdt")
    .iloc[0]
)

In [29]:
# Load the model stored at the selected run's artifact URI and display it.
best_linear_model = load_model(best_linear["artifact_uri"])
best_linear_model

In [30]:
from utilities.train_regression_model import iterative_prediction

In [37]:
# List the column names after renaming (used to write down the feature list by hand).
df_cycle_2.columns.tolist()

['status_time',
 'device_uuid',
 'battery_level_percent',
 'radio_level_percent',
 'air_temperature',
 'battery_diff',
 'cycle_id',
 'battery_level_percent_rolling_median_5',
 'battery_diff_rolling_median_5',
 'battery_level_percent_rolling_median_50',
 'battery_diff_rolling_median_50',
 'v01.49',
 'v01.66',
 'v01.70',
 '1.0',
 'battery_type_id_2.0',
 '0572 2620',
 '0572 2621',
 '0572 2622',
 '0572 2623',
 'target']

In [70]:
import random

class FakeModel:
    """Stand-in model exposing the minimal interface used in this notebook.

    It provides ``feature_names_in_`` and ``predict`` so that it can be handed
    to ``iterative_prediction`` in place of a fitted model. It learns nothing:
    every prediction is the current battery level lowered by a random offset.
    """

    # Feature columns the stand-in claims to consume. They mirror the columns
    # of ``df_cycle_2`` after renaming, minus ``status_time``, ``device_uuid``
    # and ``target``.
    feature_names_in_ = [
        "battery_level_percent",
        "radio_level_percent",
        "air_temperature",
        "battery_diff",
        "cycle_id",
        "battery_level_percent_rolling_median_5",
        "battery_diff_rolling_median_5",
        "battery_level_percent_rolling_median_50",
        "battery_diff_rolling_median_50",
        "v01.49",
        "v01.66",
        "v01.70",
        "1.0",
        "battery_type_id_2.0",
        "0572 2620",
        "0572 2621",
        "0572 2622",
        "0572 2623",
    ]

    @staticmethod
    def predict(in_df):
        """Return the battery level of ``in_df`` shifted down by a random offset.

        A single offset is drawn from ``random.uniform(-1.5, -0.4)`` per call
        and added to every row of ``in_df["battery_level_percent"]``, so the
        result is not reproducible unless ``random`` is seeded by the caller.

        Declared as a static method so that the call works both on the class
        itself (``FakeModel.predict(df)``) and on an instance
        (``FakeModel().predict(df)``).
        """
        return in_df["battery_level_percent"] + random.uniform(-1.5, -0.4)

In [74]:
# Run the iterative prediction on cycle 2 with n_predictions=50.
# The FakeModel class itself (not the loaded best_linear_model) is passed as the
# model; the prediction horizon is taken from the best linear run's parameters.
df_pred = iterative_prediction(model=FakeModel, input_df=df_cycle_2, n_predictions=50, prediction_horizon=best_linear["params.pred_hor"])

In [81]:
# Overlay the iteratively predicted battery curve and the censored input curve
# of cycle 2 for the example device.
plots.plot_compare_multiple_dfs(
    [
        (df_pred, "battery_level_percent", DEV, "Vorhersage", False),
        (df_cycle_2, "battery_level_percent", DEV, "Zensierte<br>Kurve", False),
    ],
    colors=["red", "#09D09D"],
    title="Iterative Vorhersage auf zensiertem Zyklus."
)

In [33]:
# Show the last 20 rows of the prediction frame.
df_pred.tail(20)

Unnamed: 0,status_time,device_uuid,battery_level_percent,radio_level_percent,air_temperature,battery_diff,cycle_id,battery_level_percent_rolling_median_5,battery_diff_rolling_median_5,battery_level_percent_rolling_median_50,...,v01.49,v01.66,v01.70,1.0,battery_type_id_2.0,0572 2620,0572 2621,0572 2622,0572 2623,target
23,2023-08-07,00082fe6-1ce0-43a0-ae8b-f42a36f5b2cc,55.0,95.0,2.80983,-2.0,2.0,58.0,-2.0,84.25,...,False,True,False,True,False,True,False,False,False,46.5
24,2023-08-08,00082fe6-1ce0-43a0-ae8b-f42a36f5b2cc,55.0,90.0,2.040802,0.0,2.0,57.0,-2.0,84.0,...,False,True,False,True,False,True,False,False,False,42.2
25,2023-08-09,00082fe6-1ce0-43a0-ae8b-f42a36f5b2cc,54.75,100.0,1.811836,-0.25,2.0,55.0,-1.0,82.5,...,False,True,False,True,False,True,False,False,False,42.0
26,2023-08-10,00082fe6-1ce0-43a0-ae8b-f42a36f5b2cc,54.0,76.25,1.960236,-0.75,2.0,55.0,-0.75,81.0,...,False,True,False,True,False,True,False,False,False,40.5
27,2023-08-11,00082fe6-1ce0-43a0-ae8b-f42a36f5b2cc,52.75,88.75,0.506012,-1.25,2.0,54.75,-0.75,80.833333,...,False,True,False,True,False,True,False,False,False,39.5
28,2023-08-12,00082fe6-1ce0-43a0-ae8b-f42a36f5b2cc,51.2,91.0,1.34574,-1.55,2.0,54.0,-0.75,80.666667,...,False,True,False,True,False,True,False,False,False,39.25
29,2023-08-13,00082fe6-1ce0-43a0-ae8b-f42a36f5b2cc,50.5,76.25,1.036865,-0.7,2.0,52.75,-0.75,80.583333,...,False,True,False,True,False,True,False,False,False,38.428571
30,2023-08-14,00082fe6-1ce0-43a0-ae8b-f42a36f5b2cc,50.25,93.75,0.600945,-0.25,2.0,51.2,-0.75,80.5,...,False,True,False,True,False,True,False,False,False,35.75
31,2023-08-15,00082fe6-1ce0-43a0-ae8b-f42a36f5b2cc,47.5,100.0,0.792465,-2.75,2.0,50.5,-1.25,79.5,...,False,True,False,True,False,True,False,False,False,34.0
32,2023-08-16,00082fe6-1ce0-43a0-ae8b-f42a36f5b2cc,47.0,87.5,0.951019,-0.5,2.0,50.25,-0.7,78.5,...,False,True,False,True,False,True,False,False,False,33.75
