# Regression iterative Prediction Analysis

This notebook analyses the discharge curves predicted iteratively by the regression models and picks out a few interesting cycles for the thesis.

In [1]:
import pandas as pd
import random
import plotly.express as px

import utilities.generate_regression_dataset as gen_reg
import utilities.train_regression_model as train_reg
import utilities.plotting as plots
from cross_validate_regression import augmented_base_dataset
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [2]:
# Load the merged raw measurements from the project's parquet file; the
# "device_uuid" column of this frame drives the device split in this notebook.
raw_merged_df = pd.read_parquet(path="./data/my_datasets/raw_merged.parquet")

In [3]:
# Device-level train/test split.
# A dedicated, seeded generator keeps the split identical across kernel
# restarts; the unseeded module-level `random.sample` produced a different
# split on every run.
SPLIT_SEED = 42
# Number of devices used for training (raises ValueError in `sample` if the
# data contains fewer devices than this).
N_TRAIN_DEVICES = 63

device_uuids = raw_merged_df["device_uuid"].unique().tolist()
train_uuids = random.Random(SPLIT_SEED).sample(device_uuids, N_TRAIN_DEVICES)

# Build the complement while preserving the order of `device_uuids`; a set
# difference would yield the same devices but in an arbitrary order.
train_uuid_set = set(train_uuids)
test_uuids = [
    device_uuid for device_uuid in device_uuids if device_uuid not in train_uuid_set
]

In [4]:
def sort_dict_by_values(x: dict, ascending=True) -> dict:
    """Return a new dict holding the items of ``x`` ordered by their values.

    Args:
        x: Mapping whose values are mutually comparable.
        ascending: Smallest value first when true, largest value first otherwise.

    Returns:
        A new dict; ``x`` itself is left untouched. Items with equal values
        keep their original relative order.
    """
    descending = not ascending
    ordered_pairs = sorted(x.items(), key=lambda pair: pair[1], reverse=descending)
    return {key: value for key, value in ordered_pairs}

In [5]:
def predict_and_plot_cycle(model, unaugmented_model, cycle: int, n_inputs: int):
    """Print per-feature summaries of one test cycle and plot both models' iterative predictions.

    The function reads the notebook-level frames ``test_df`` and
    ``base_test_df``; both must exist before it is called.

    Args:
        model: Fitted regressor whose predictions are plotted under the label
            "Augmented"; its class name is also used in the plot title.
        unaugmented_model: Fitted regressor whose predictions are plotted
            under the label "Unaugmented".
        cycle: Value of ``cycle_id`` selecting the cycle to inspect.
        n_inputs: Number of leading rows of the cycle passed to the iterative
            prediction as input; one prediction is requested for every
            remaining row.

    Raises:
        IndexError: If the selected cycle has no more than ``n_inputs`` rows,
            because the prediction start time is read from row ``n_inputs``.
    """
    test_cycle = test_df[test_df["cycle_id"] == cycle]

    base_test_cycle = base_test_df[base_test_df["cycle_id"] == cycle]
    for feature in base_test_cycle.columns.drop(
        ["status_time", "device_uuid", "cycle_id"]
    ):
        print(f"{feature} Mean:")
        try:
            print(base_test_cycle[feature].mean())
        except (TypeError, ValueError):
            # Columns that cannot be averaged (e.g. string-valued ones) are
            # summarised by their first distinct value instead. Only the
            # conversion errors are caught, so interrupts and unrelated
            # failures are no longer swallowed.
            print(base_test_cycle[feature].unique()[0])

    test_cycle_len = len(test_cycle)
    n_predictions = test_cycle_len - n_inputs
    # Timestamp of the first row that is predicted rather than given as input.
    prediction_start_date = test_cycle.iloc[n_inputs]["status_time"]
    predictions = train_reg.iterative_prediction(
        model,
        test_cycle.head(n_inputs),
        n_predictions=n_predictions,
        prediction_horizon=1,
    )
    unaugmented_predictions = train_reg.iterative_prediction(
        unaugmented_model,
        test_cycle.head(n_inputs),
        n_predictions=n_predictions,
        prediction_horizon=1,
    )
    plots.plot_iterative_prediction(
        test_df=test_cycle,
        pred_dfs=[(predictions, "Augmented"), (unaugmented_predictions, "Unaugmented")],
        prediction_start_date=prediction_start_date,
        title=f"Iterative Vorhersage von {model.__class__.__name__} für Cycle {int(cycle)}",
        divergence_threshold=10,
        size=(1000, 400),
    )

In [9]:
# For every model class found in the cross-validation results: take the
# augmentation parameters of the best augmented run, train one model on
# augmented and one on unaugmented data, and plot selected test cycles.
reg_cv_df = pd.read_csv("./data/cross_validation/reg_cross_validation.csv")
# Only the runs with 63 devices are considered.
reg_cv_df = reg_cv_df[reg_cv_df["params.n_dev"] == 63]

# Number of leading rows of a cycle given to the iterative prediction as input.
n_inputs = 70

# Fitted models keyed by "<ClassName>_aug" / "<ClassName>_unaug".
models = {}

# Columns that are identifiers or the regression target, i.e. not model inputs.
non_feature_columns = ["status_time", "device_uuid", "cycle_id", "target"]

for model_class, model_group in reg_cv_df.groupby(by="params.model_class"):
    # Sort into a new frame: the group is a slice of `reg_cv_df`, and an
    # in-place sort on such a slice can trigger SettingWithCopyWarning.
    model_group = model_group.sort_values(by="metrics.mdt", ascending=False)
    # Run with the highest "metrics.mdt" among those that used augmentation.
    best = model_group[model_group["params.n_aug"] != 0].iloc[0]
    max_jittering_battery_level = best["params.max_noise"]
    max_jittering_air_temperature = best["params.max_noise_temperature"]
    max_jittering_measurement_interval = best["params.random_max_time_warp_percent"]

    # An augmentation is switched on exactly when its magnitude is non-zero.
    train_df_aug_params = {
        "add_noise": max_jittering_battery_level != 0,
        "max_noise": max_jittering_battery_level,
        "add_noise_temperature": max_jittering_air_temperature != 0,
        "max_noise_temperature": max_jittering_air_temperature,
        "random_warp_status_times": max_jittering_measurement_interval != 0,
        "random_max_time_warp_percent": max_jittering_measurement_interval,
    }

    # NOTE(review): n_aug is fixed to 1 here and does not use
    # best["params.n_aug"] - confirm that this is intended.
    base_train_df = augmented_base_dataset(
        raw_merged_df,
        n_aug=1,
        device_subset=train_uuids,
        params=train_df_aug_params,
    )
    # NOTE(review): the two n_aug=0 datasets below receive the same arguments
    # in every iteration; they could presumably be built once before the loop
    # if `augmented_base_dataset` has no side effects - verify before hoisting.
    unaugmented_base_train_df = augmented_base_dataset(
        raw_merged_df,
        n_aug=0,
        device_subset=train_uuids,
    )

    base_test_df = augmented_base_dataset(
        raw_merged_df,
        n_aug=0,
        device_subset=test_uuids,
    )

    train_df = gen_reg.base_to_regression_dataset(base_train_df, prediction_horizon=1)
    unaugmented_train_df = gen_reg.base_to_regression_dataset(
        unaugmented_base_train_df, prediction_horizon=1
    )
    test_df = gen_reg.base_to_regression_dataset(base_test_df, prediction_horizon=1)

    # Features present in both train and test data, kept in the column order
    # of `train_df`. Going through list(set(...)) made the feature order vary
    # between runs, so the fitted models were not reproducible.
    test_feature_columns = set(test_df.columns.drop(non_feature_columns))
    features = [
        column
        for column in train_df.columns.drop(non_feature_columns)
        if column in test_feature_columns
    ]
    X_train, y_train = train_reg.df_to_X_y(train_df, features=features)
    unaugmented_X_train, unaugmented_y_train = train_reg.df_to_X_y(
        unaugmented_train_df, features=features
    )
    X_test, y_test = train_reg.df_to_X_y(test_df, features=features)

    # Any model class other than the two named ones falls back to XGBRegressor.
    if model_class == "LinearRegression":
        model = LinearRegression()
        unaugmented_model = LinearRegression()
    elif model_class == "DecisionTreeRegressor":
        model = DecisionTreeRegressor()
        unaugmented_model = DecisionTreeRegressor()
    else:
        model = XGBRegressor()
        unaugmented_model = XGBRegressor()

    print(model.__class__.__name__)
    model.fit(X_train, y_train)
    unaugmented_model.fit(unaugmented_X_train, unaugmented_y_train)

    models[f"{model.__class__.__name__}_aug"] = model
    models[f"{model.__class__.__name__}_unaug"] = unaugmented_model

    # Augmented: per-cycle times from the fifth return value, reduced to the
    # first entry of each cycle.
    _, _, _, _, cycle_times_dict, _ = train_reg.divergence_time_metrics(
        model, test_df, prediction_horizon=1, input_size=n_inputs
    )
    cycle_times_dict = {k: v[0] for k, v in cycle_times_dict.items()}

    # Unaugmented: computed the same way; not used for cycle selection below.
    _, _, _, _, unaugmented_cycle_times, _ = train_reg.divergence_time_metrics(
        unaugmented_model, test_df, prediction_horizon=1, input_size=n_inputs
    )
    unaugmented_cycle_times = {k: v[0] for k, v in unaugmented_cycle_times.items()}

    # Plot the three cycles with the largest time for the augmented model.
    for cycle in list(sort_dict_by_values(cycle_times_dict, ascending=False).keys())[
        0:3
    ]:
        predict_and_plot_cycle(model, unaugmented_model, cycle, n_inputs=n_inputs)

    # Plot up to three cycles with the smallest times, skipping cycles whose
    # time exceeds 50 or that have fewer than 2 * n_inputs rows.
    counter = 0
    for cycle in list(sort_dict_by_values(cycle_times_dict).keys()):
        if cycle_times_dict[cycle] > 50:
            continue
        if len(test_df[test_df["cycle_id"] == cycle]) < 2 * n_inputs:
            continue
        counter += 1
        if counter > 3:
            break
        predict_and_plot_cycle(model, unaugmented_model, cycle, n_inputs=n_inputs)

    print("\n\n")

Dropped 1175 rows in 7 cycles.
134 cycles left.
Dropped 3005 rows in 16 cycles.
95 cycles left.
Dropped 1118 rows in 12 cycles.
59 cycles left.
DecisionTreeRegressor
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the

battery_level_percent Mean:
88.78605283605283
radio_level_percent Mean:
99.91663144547759
air_temperature Mean:
-7.733365782885096
battery_type_id Mean:
1.0
fw_version Mean:
v01.49
device_model_code Mean:
0572 2622
battery_diff Mean:
-0.13979289940828402
Adding divergence line
Adding divergence line


battery_level_percent Mean:
82.92674094707522
radio_level_percent Mean:
99.59261838440112
air_temperature Mean:
-8.590999408221988
battery_type_id Mean:
2.0
fw_version Mean:
v01.49
device_model_code Mean:
0572 2622
battery_diff Mean:
-0.09976787372330546
Adding divergence line
Adding divergence line


battery_level_percent Mean:
56.022327044025154
radio_level_percent Mean:
54.26519916142557
air_temperature Mean:
-18.75360610990566
battery_type_id Mean:
1.0
fw_version Mean:
v01.49
device_model_code Mean:
0572 2620
battery_diff Mean:
0.0
Adding divergence line
Adding divergence line


battery_level_percent Mean:
36.534535308441555
radio_level_percent Mean:
82.92157512626264
air_temperature Mean:
4.100741950094923
battery_type_id Mean:
1.0
fw_version Mean:
v01.49
device_model_code Mean:
0572 2622
battery_diff Mean:
-0.5125
Adding divergence line
Adding divergence line


battery_level_percent Mean:
77.06902718168814
radio_level_percent Mean:
65.93526466380543
air_temperature Mean:
-13.815014642993564
battery_type_id Mean:
1.0
fw_version Mean:
v01.49
device_model_code Mean:
0572 2622
battery_diff Mean:
-1.0861230329041482
Adding divergence line
Adding divergence line





Dropped 1465 rows in 6 cycles.
129 cycles left.
Dropped 3005 rows in 16 cycles.
95 cycles left.
Dropped 1118 rows in 12 cycles.
59 cycles left.
LinearRegression
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the p

battery_level_percent Mean:
70.31061130334487
radio_level_percent Mean:
100.0
air_temperature Mean:
-21.485361970530565
battery_type_id Mean:
2.0
fw_version Mean:
v01.49
device_model_code Mean:
0572 2623
battery_diff Mean:
0.10495963091118798
Adding divergence line
Adding divergence line


battery_level_percent Mean:
86.89957562122916
radio_level_percent Mean:
99.8312710911136
air_temperature Mean:
4.037306409613687
battery_type_id Mean:
2.0
fw_version Mean:
v01.49
device_model_code Mean:
0572 2622
battery_diff Mean:
-0.08923884514435697
Adding divergence line
Adding divergence line


battery_level_percent Mean:
56.022327044025154
radio_level_percent Mean:
54.26519916142557
air_temperature Mean:
-18.75360610990566
battery_type_id Mean:
1.0
fw_version Mean:
v01.49
device_model_code Mean:
0572 2620
battery_diff Mean:
0.0
Adding divergence line
Adding divergence line


battery_level_percent Mean:
91.36389541309754
radio_level_percent Mean:
96.87757015443185
air_temperature Mean:
-16.095591204205014
battery_type_id Mean:
1.0
fw_version Mean:
v01.49
device_model_code Mean:
0572 2620
battery_diff Mean:
0.0007978723404255319
Adding divergence line
Adding divergence line


battery_level_percent Mean:
75.0109387755102
radio_level_percent Mean:
99.97214285714286
air_temperature Mean:
1.9857705623643298
battery_type_id Mean:
1.0
fw_version Mean:
v01.49
device_model_code Mean:
0572 2622
battery_diff Mean:
-0.37542857142857144
Adding divergence line
Adding divergence line





Dropped 1708 rows in 6 cycles.
133 cycles left.
Dropped 3005 rows in 16 cycles.
95 cycles left.
Dropped 1118 rows in 12 cycles.
59 cycles left.
XGBRegressor
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the prediction horizon.
The cycle length is smaller than the predi

battery_level_percent Mean:
88.78605283605283
radio_level_percent Mean:
99.91663144547759
air_temperature Mean:
-7.733365782885096
battery_type_id Mean:
1.0
fw_version Mean:
v01.49
device_model_code Mean:
0572 2622
battery_diff Mean:
-0.13979289940828402
Adding divergence line
Adding divergence line


battery_level_percent Mean:
85.40794066317628
radio_level_percent Mean:
81.89497631513339
air_temperature Mean:
2.571972832723791
battery_type_id Mean:
1.0
fw_version Mean:
v01.49
device_model_code Mean:
0572 2622
battery_diff Mean:
-0.2940663176265271
Adding divergence line
Adding divergence line


battery_level_percent Mean:
56.022327044025154
radio_level_percent Mean:
54.26519916142557
air_temperature Mean:
-18.75360610990566
battery_type_id Mean:
1.0
fw_version Mean:
v01.49
device_model_code Mean:
0572 2620
battery_diff Mean:
0.0
Adding divergence line
Adding divergence line


battery_level_percent Mean:
59.342152575315836
radio_level_percent Mean:
99.953231292517
air_temperature Mean:
8.76860171649336
battery_type_id Mean:
1.0
fw_version Mean:
v01.49
device_model_code Mean:
0572 2622
battery_diff Mean:
-0.5827664399092971
Adding divergence line
Adding divergence line


battery_level_percent Mean:
77.06902718168814
radio_level_percent Mean:
65.93526466380543
air_temperature Mean:
-13.815014642993564
battery_type_id Mean:
1.0
fw_version Mean:
v01.49
device_model_code Mean:
0572 2622
battery_diff Mean:
-1.0861230329041482
Adding divergence line
Adding divergence line







In [21]:
# Collect, for every fitted model, all per-cycle times returned by
# `divergence_time_metrics` (called with input_size=-1) into one long frame
# with the columns "Model", "DT" and "Augmentiert".
all_cycle_times = []
for model_name, model in models.items():
    print(model_name)
    _, _, _, _, cycle_times_dict, _ = train_reg.divergence_time_metrics(
        model, test_df, prediction_horizon=1, input_size=-1
    )

    model_type = model.__class__.__name__
    # Keys in `models` end in "_aug" or "_unaug"; only the latter contain "unaug".
    is_augmented = "unaug" not in model_name
    cycle_times_list = [
        {"Model": model_type, "DT": cycle_time, "Augmentiert": is_augmented}
        for cycle_times in cycle_times_dict.values()
        for cycle_time in cycle_times
    ]
    all_cycle_times.append(pd.DataFrame(cycle_times_list))

cycle_times_df = pd.concat(all_cycle_times)

DecisionTreeRegressor_aug
DecisionTreeRegressor_unaug
LinearRegression_aug
LinearRegression_unaug
XGBRegressor_aug
XGBRegressor_unaug


In [24]:
# Box plot of the collected times: "DT" on the x axis, one row per model
# class, coloured by whether the model was trained with augmentation.
fig = px.box(
    data_frame=cycle_times_df,
    x="DT",
    y="Model",
    color="Augmentiert",
    points="all",
    width=1000,
    height=600,
    title="Vergleich von Abweichungszeiten der drei Modelltypen mit und ohne Augmentation",
)
fig.update_xaxes(title="Abweichungszeit")
fig.update_yaxes(title="Modelltyp")
fig.show()